mirror of https://github.com/onyx-dot-app/onyx.git
synced 2026-02-20 01:05:46 +00:00

Compare commits: pdf_fix ... testing_li
1 commit: 59c454debe
```
@@ -23,10 +23,6 @@ env:
  # Jira
  JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
  JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}

  GONG_ACCESS_KEY: ${{ secrets.GONG_ACCESS_KEY }}
  GONG_ACCESS_KEY_SECRET: ${{ secrets.GONG_ACCESS_KEY_SECRET }}

  # Google
  GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }}
  GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1 }}
```
README.md

@@ -30,26 +30,30 @@ Keep knowledge and access controls sync-ed across over 40 connectors like Google

Create custom AI agents with unique prompts, knowledge, and actions that the agents can take.
Onyx can be deployed securely anywhere and for any scale - on a laptop, on-premise, or to cloud.

<h3>Feature Highlights</h3>

**Deep research over your team's knowledge:**

https://private-user-images.githubusercontent.com/32520769/414509312-48392e83-95d0-4fb5-8650-a396e05e0a32.mp4?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3Mzk5Mjg2MzYsIm5iZiI6MTczOTkyODMzNiwicGF0aCI6Ii8zMjUyMDc2OS80MTQ1MDkzMTItNDgzOTJlODMtOTVkMC00ZmI1LTg2NTAtYTM5NmUwNWUwYTMyLm1wND9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNTAyMTklMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjUwMjE5VDAxMjUzNlomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWFhMzk5Njg2Y2Y5YjFmNDNiYTQ2YzM5ZTg5YWJiYTU2NWMyY2YwNmUyODE2NWUxMDRiMWQxZWJmODI4YTA0MTUmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0In0.a9D8A0sgKE9AoaoE-mfFbJ6_OKYeqaf7TZ4Han2JfW8

**Use Onyx as a secure AI Chat with any LLM:**



**Easily set up connectors to your apps:**



**Access Onyx where your team already works:**



## Deployment

**To try it out for free and get started in seconds, check out [Onyx Cloud](https://cloud.onyx.app/signup)**.

Onyx can also be run locally (even on a laptop) or deployed on a virtual machine with a single

@@ -58,23 +62,23 @@ Onyx can also be run locally (even on a laptop) or deployed on a virtual machine

We also have built-in support for high-availability/scalable deployment on Kubernetes.
References [here](https://github.com/onyx-dot-app/onyx/tree/main/deployment).

## 🔍 Other Notable Benefits of Onyx

- Custom deep learning models for indexing and inference time, only through Onyx + learning from user feedback.
- Flexible security features like SSO (OIDC/SAML/OAuth2), RBAC, encryption of credentials, etc.
- Knowledge curation features like document-sets, query history, usage analytics, etc.
- Scalable deployment options tested up to many tens of thousands of users and hundreds of millions of documents.

## 🚧 Roadmap

- New methods in information retrieval (StructRAG, LightGraphRAG, etc.)
- Personalized Search
- Organizational understanding and ability to locate and suggest experts from your team.
- Code Search
- SQL and Structured Query Language

## 🔌 Connectors

Keep knowledge and access in sync across 40+ connectors:

- Google Drive

@@ -95,65 +99,19 @@ Keep knowledge and access up to sync across 40+ connectors:

See the full list [here](https://docs.onyx.app/connectors).

## 📚 Licensing

There are two editions of Onyx:

- Onyx Community Edition (CE) is available freely under the MIT Expat license. Simply follow the Deployment guide above.
- Onyx Enterprise Edition (EE) includes extra features that are primarily useful for larger organizations.
  For feature details, check out [our website](https://www.onyx.app/pricing).

To try the Onyx Enterprise Edition:

1. Check out [Onyx Cloud](https://cloud.onyx.app/signup).
2. For self-hosting the Enterprise Edition, contact us at [founders@onyx.app](mailto:founders@onyx.app) or book a call with us on our [Cal](https://cal.com/team/onyx/founders).

## 💡 Contributing

Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md) for more details.
# YC Company Twitter Scraper

A script that scrapes YC company pages and extracts Twitter/X.com links.

## Requirements

- Python 3.7+
- Playwright

## Installation

1. Install the required packages:

```
pip install -r requirements.txt
```

2. Install Playwright browsers:

```
playwright install
```

## Usage

Run the script with default settings:

```
python scrape_yc_twitter.py
```

This will scrape the YC companies from recent batches (W23, S23, S24, F24, S22, W22) and save the Twitter links to `twitter_links.txt`.

### Custom URL and Output

```
python scrape_yc_twitter.py --url "https://www.ycombinator.com/companies?batch=W24" --output "w24_twitter.txt"
```

## How it works

1. Navigates to the specified YC companies page
2. Scrolls down to load all company cards
3. Extracts links to individual company pages
4. Visits each company page and extracts Twitter/X.com links
5. Saves the results to a text file
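The five-step flow above maps cleanly onto Playwright's sync API. Below is a minimal, hypothetical sketch of that flow; the CSS selector for company links, the scroll-loop bound, and the function name are illustrative assumptions, not code from `scrape_yc_twitter.py`.

```
# Minimal sketch (assumed selectors and limits), not the actual scrape_yc_twitter.py
from playwright.sync_api import sync_playwright


def scrape_twitter_links(url: str, output_path: str = "twitter_links.txt") -> None:
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)

        # Steps 1-2: scroll to trigger lazy loading of company cards
        for _ in range(20):  # assumed upper bound on scroll iterations
            page.mouse.wheel(0, 4000)
            page.wait_for_timeout(500)

        # Step 3: collect links to individual company pages (selector is an assumption)
        company_links = page.eval_on_selector_all(
            "a[href^='/companies/']", "els => els.map(e => e.href)"
        )

        # Step 4: visit each company page and pull Twitter/X links
        twitter_links: set[str] = set()
        for link in company_links:
            page.goto(link)
            hrefs = page.eval_on_selector_all("a", "els => els.map(e => e.href)")
            twitter_links.update(h for h in hrefs if "twitter.com" in h or "x.com" in h)

        browser.close()

    # Step 5: save results
    with open(output_path, "w") as f:
        f.write("\n".join(sorted(twitter_links)))
```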
@@ -1,45 +0,0 @@
```
@@ -51,9 +51,9 @@ def _get_objects_access_for_user_email_from_salesforce(

    # This is cached in the function so the first query takes an extra 0.1-0.3 seconds
    # but subsequent queries by the same user are essentially instant
    start_time = time.monotonic()
    start_time = time.time()
    user_id = get_salesforce_user_id_from_email(salesforce_client, user_email)
    end_time = time.monotonic()
    end_time = time.time()
    logger.info(
        f"Time taken to get Salesforce user ID: {end_time - start_time} seconds"
    )
```
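Both `time.monotonic()` and `time.time()` appear in this hunk for the same measurement. As general Python guidance (not something this diff states), `time.monotonic()` is usually the safer choice for elapsed-time measurement because it never jumps when the system clock is adjusted. A tiny self-contained illustration:

```
import time

start = time.monotonic()   # monotonic clock: unaffected by system clock changes
time.sleep(0.1)            # stand-in for the cached Salesforce user-ID lookup
elapsed = time.monotonic() - start
print(f"Time taken: {elapsed:.3f} seconds")
```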
```
@@ -1,6 +1,10 @@
from simple_salesforce import Salesforce
from sqlalchemy.orm import Session

from onyx.connectors.salesforce.sqlite_functions import get_user_id_by_email
from onyx.connectors.salesforce.sqlite_functions import init_db
from onyx.connectors.salesforce.sqlite_functions import NULL_ID_STRING
from onyx.connectors.salesforce.sqlite_functions import update_email_to_id_table
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.document import get_cc_pairs_for_document
from onyx.utils.logger import setup_logger
@@ -24,8 +28,6 @@ def get_any_salesforce_client_for_doc_id(
    E.g. there are 2 different credential sets for 2 different salesforce cc_pairs
    but only one has the permissions to access the permissions needed for the query.
    """

    # NOTE: this global seems very very bad
    global _ANY_SALESFORCE_CLIENT
    if _ANY_SALESFORCE_CLIENT is None:
        cc_pairs = get_cc_pairs_for_document(db_session, doc_id)
@@ -82,21 +84,35 @@ def get_salesforce_user_id_from_email(
    salesforce database. (Around 0.1-0.3 seconds)
    If it's cached or stored in the local salesforce database, it's fast (<0.001 seconds).
    """

    # NOTE: this global seems bad
    global _CACHED_SF_EMAIL_TO_ID_MAP
    if user_email in _CACHED_SF_EMAIL_TO_ID_MAP:
        if _CACHED_SF_EMAIL_TO_ID_MAP[user_email] is not None:
            return _CACHED_SF_EMAIL_TO_ID_MAP[user_email]

    # some caching via sqlite existed here before ... check history if interested

    # ...query Salesforce and store the result in the database
    user_id = _query_salesforce_user_id(sf_client, user_email)
    db_exists = True
    try:
        # Check if the user is already in the database
        user_id = get_user_id_by_email(user_email)
    except Exception:
        init_db()
        try:
            user_id = get_user_id_by_email(user_email)
        except Exception as e:
            logger.error(f"Error checking if user is in database: {e}")
            user_id = None
            db_exists = False

    # If no entry is found in the database (indicated by user_id being None)...
    if user_id is None:
        # ...query Salesforce and store the result in the database
        user_id = _query_salesforce_user_id(sf_client, user_email)
        if db_exists:
            update_email_to_id_table(user_email, user_id)
        return user_id
    elif user_id is None:
        return None
    elif user_id == NULL_ID_STRING:
        return None

    # If the found user_id is real, cache it
    _CACHED_SF_EMAIL_TO_ID_MAP[user_email] = user_id
    return user_id
```
```
@@ -5,14 +5,12 @@ from slack_sdk import WebClient
from ee.onyx.external_permissions.slack.utils import fetch_user_id_to_email_map
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.connectors.credentials_provider import OnyxDBCredentialsProvider
from onyx.connectors.slack.connector import get_channels
from onyx.connectors.slack.connector import make_paginated_slack_api_call_w_retries
from onyx.connectors.slack.connector import SlackConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import get_current_tenant_id


logger = setup_logger()
@@ -103,12 +101,7 @@ def _get_slack_document_access(
    callback: IndexingHeartbeatInterface | None,
) -> Generator[DocExternalAccess, None, None]:
    slack_connector = SlackConnector(**cc_pair.connector.connector_specific_config)

    # Use credentials provider instead of directly loading credentials
    provider = OnyxDBCredentialsProvider(
        get_current_tenant_id(), "slack", cc_pair.credential.id
    )
    slack_connector.set_credentials_provider(provider)
    slack_connector.load_credentials(cc_pair.credential.credential_json)

    slim_doc_generator = slack_connector.retrieve_all_slim_documents(callback=callback)
```
```
@@ -51,7 +51,6 @@ def _get_slack_group_members_email(


def slack_group_sync(
    tenant_id: str,
    cc_pair: ConnectorCredentialPair,
) -> list[ExternalUserGroup]:
    slack_client = WebClient(
```
```
@@ -15,7 +15,6 @@ from ee.onyx.external_permissions.post_query_censoring import (
    DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION,
)
from ee.onyx.external_permissions.slack.doc_sync import slack_doc_sync
from ee.onyx.external_permissions.slack.group_sync import slack_group_sync
from onyx.access.models import DocExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.db.models import ConnectorCredentialPair
@@ -57,7 +56,6 @@ DOC_PERMISSIONS_FUNC_MAP: dict[DocumentSource, DocSyncFuncType] = {
GROUP_PERMISSIONS_FUNC_MAP: dict[DocumentSource, GroupSyncFuncType] = {
    DocumentSource.GOOGLE_DRIVE: gdrive_group_sync,
    DocumentSource.CONFLUENCE: confluence_group_sync,
    DocumentSource.SLACK: slack_group_sync,
}
```
```
@@ -1,62 +0,0 @@
from collections.abc import Hashable
from typing import cast

from langchain_core.runnables.config import RunnableConfig
from langgraph.types import Send

from onyx.agents.agent_search.dc_search_analysis.states import ObjectInformationInput
from onyx.agents.agent_search.dc_search_analysis.states import (
    ObjectResearchInformationUpdate,
)
from onyx.agents.agent_search.dc_search_analysis.states import ObjectSourceInput
from onyx.agents.agent_search.dc_search_analysis.states import (
    SearchSourcesObjectsUpdate,
)
from onyx.agents.agent_search.models import GraphConfig


def parallel_object_source_research_edge(
    state: SearchSourcesObjectsUpdate, config: RunnableConfig
) -> list[Send | Hashable]:
    """
    LangGraph edge to parallelize the research for an individual object and source
    """

    search_objects = state.analysis_objects
    search_sources = state.analysis_sources

    object_source_combinations = [
        (object, source) for object in search_objects for source in search_sources
    ]

    return [
        Send(
            "research_object_source",
            ObjectSourceInput(
                object_source_combination=object_source_combination,
                log_messages=[],
            ),
        )
        for object_source_combination in object_source_combinations
    ]


def parallel_object_research_consolidation_edge(
    state: ObjectResearchInformationUpdate, config: RunnableConfig
) -> list[Send | Hashable]:
    """
    LangGraph edge to parallelize the research for an individual object and source
    """
    cast(GraphConfig, config["metadata"]["config"])
    object_research_information_results = state.object_research_information_results

    return [
        Send(
            "consolidate_object_research",
            ObjectInformationInput(
                object_information=object_information,
                log_messages=[],
            ),
        )
        for object_information in object_research_information_results
    ]
```
```
@@ -1,103 +0,0 @@
from langgraph.graph import END
from langgraph.graph import START
from langgraph.graph import StateGraph

from onyx.agents.agent_search.dc_search_analysis.edges import (
    parallel_object_research_consolidation_edge,
)
from onyx.agents.agent_search.dc_search_analysis.edges import (
    parallel_object_source_research_edge,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a1_search_objects import (
    search_objects,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a2_research_object_source import (
    research_object_source,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a3_structure_research_by_object import (
    structure_research_by_object,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a4_consolidate_object_research import (
    consolidate_object_research,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a5_consolidate_research import (
    consolidate_research,
)
from onyx.agents.agent_search.dc_search_analysis.states import MainInput
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.utils.logger import setup_logger

logger = setup_logger()

test_mode = False


def divide_and_conquer_graph_builder(test_mode: bool = False) -> StateGraph:
    """
    LangGraph graph builder for the knowledge graph search process.
    """

    graph = StateGraph(
        state_schema=MainState,
        input=MainInput,
    )

    ### Add nodes ###

    graph.add_node(
        "search_objects",
        search_objects,
    )

    graph.add_node(
        "structure_research_by_source",
        structure_research_by_object,
    )

    graph.add_node(
        "research_object_source",
        research_object_source,
    )

    graph.add_node(
        "consolidate_object_research",
        consolidate_object_research,
    )

    graph.add_node(
        "consolidate_research",
        consolidate_research,
    )

    ### Add edges ###

    graph.add_edge(start_key=START, end_key="search_objects")

    graph.add_conditional_edges(
        source="search_objects",
        path=parallel_object_source_research_edge,
        path_map=["research_object_source"],
    )

    graph.add_edge(
        start_key="research_object_source",
        end_key="structure_research_by_source",
    )

    graph.add_conditional_edges(
        source="structure_research_by_source",
        path=parallel_object_research_consolidation_edge,
        path_map=["consolidate_object_research"],
    )

    graph.add_edge(
        start_key="consolidate_object_research",
        end_key="consolidate_research",
    )

    graph.add_edge(
        start_key="consolidate_research",
        end_key=END,
    )

    return graph
```
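For context on how this builder was consumed, the `run_graph.py` hunk later in this diff constructs and compiles it like so (lines excerpted from that hunk, not new code):

```
graph = divide_and_conquer_graph_builder()
compiled_graph = graph.compile()
input = DCMainInput(log_messages=[])
return run_graph(compiled_graph, config, input)
```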
@@ -1,159 +0,0 @@
|
||||
from typing import cast
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
from langgraph.types import StreamWriter
|
||||
|
||||
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
|
||||
from onyx.agents.agent_search.dc_search_analysis.ops import research
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import MainState
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import (
|
||||
SearchSourcesObjectsUpdate,
|
||||
)
|
||||
from onyx.agents.agent_search.models import GraphConfig
|
||||
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
|
||||
trim_prompt_piece,
|
||||
)
|
||||
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
|
||||
from onyx.chat.models import AgentAnswerPiece
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.prompts.agents.dc_prompts import DC_OBJECT_NO_BASE_DATA_EXTRACTION_PROMPT
|
||||
from onyx.prompts.agents.dc_prompts import DC_OBJECT_SEPARATOR
|
||||
from onyx.prompts.agents.dc_prompts import DC_OBJECT_WITH_BASE_DATA_EXTRACTION_PROMPT
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import run_with_timeout
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def search_objects(
|
||||
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
|
||||
) -> SearchSourcesObjectsUpdate:
|
||||
"""
|
||||
LangGraph node to start the agentic search process.
|
||||
"""
|
||||
|
||||
graph_config = cast(GraphConfig, config["metadata"]["config"])
|
||||
question = graph_config.inputs.search_request.query
|
||||
search_tool = graph_config.tooling.search_tool
|
||||
|
||||
if search_tool is None or graph_config.inputs.search_request.persona is None:
|
||||
raise ValueError("Search tool and persona must be provided for DivCon search")
|
||||
|
||||
try:
|
||||
instructions = graph_config.inputs.search_request.persona.prompts[
|
||||
0
|
||||
].system_prompt
|
||||
|
||||
agent_1_instructions = extract_section(
|
||||
instructions, "Agent Step 1:", "Agent Step 2:"
|
||||
)
|
||||
if agent_1_instructions is None:
|
||||
raise ValueError("Agent 1 instructions not found")
|
||||
|
||||
agent_1_base_data = extract_section(instructions, "|Start Data|", "|End Data|")
|
||||
|
||||
agent_1_task = extract_section(
|
||||
agent_1_instructions, "Task:", "Independent Research Sources:"
|
||||
)
|
||||
if agent_1_task is None:
|
||||
raise ValueError("Agent 1 task not found")
|
||||
|
||||
agent_1_independent_sources_str = extract_section(
|
||||
agent_1_instructions, "Independent Research Sources:", "Output Objective:"
|
||||
)
|
||||
if agent_1_independent_sources_str is None:
|
||||
raise ValueError("Agent 1 Independent Research Sources not found")
|
||||
|
||||
document_sources = [
|
||||
DocumentSource(x.strip().lower())
|
||||
for x in agent_1_independent_sources_str.split(DC_OBJECT_SEPARATOR)
|
||||
]
|
||||
|
||||
agent_1_output_objective = extract_section(
|
||||
agent_1_instructions, "Output Objective:"
|
||||
)
|
||||
if agent_1_output_objective is None:
|
||||
raise ValueError("Agent 1 output objective not found")
|
||||
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Agent 1 instructions not found or not formatted correctly: {e}"
|
||||
)
|
||||
|
||||
# Extract objects
|
||||
|
||||
if agent_1_base_data is None:
|
||||
# Retrieve chunks for objects
|
||||
|
||||
retrieved_docs = research(question, search_tool)[:10]
|
||||
|
||||
document_texts_list = []
|
||||
for doc_num, doc in enumerate(retrieved_docs):
|
||||
chunk_text = "Document " + str(doc_num) + ":\n" + doc.content
|
||||
document_texts_list.append(chunk_text)
|
||||
|
||||
document_texts = "\n\n".join(document_texts_list)
|
||||
|
||||
dc_object_extraction_prompt = DC_OBJECT_NO_BASE_DATA_EXTRACTION_PROMPT.format(
|
||||
question=question,
|
||||
task=agent_1_task,
|
||||
document_text=document_texts,
|
||||
objects_of_interest=agent_1_output_objective,
|
||||
)
|
||||
else:
|
||||
dc_object_extraction_prompt = DC_OBJECT_WITH_BASE_DATA_EXTRACTION_PROMPT.format(
|
||||
question=question,
|
||||
task=agent_1_task,
|
||||
base_data=agent_1_base_data,
|
||||
objects_of_interest=agent_1_output_objective,
|
||||
)
|
||||
|
||||
msg = [
|
||||
HumanMessage(
|
||||
content=trim_prompt_piece(
|
||||
config=graph_config.tooling.primary_llm.config,
|
||||
prompt_piece=dc_object_extraction_prompt,
|
||||
reserved_str="",
|
||||
),
|
||||
)
|
||||
]
|
||||
primary_llm = graph_config.tooling.primary_llm
|
||||
# Grader
|
||||
try:
|
||||
llm_response = run_with_timeout(
|
||||
30,
|
||||
primary_llm.invoke,
|
||||
prompt=msg,
|
||||
timeout_override=30,
|
||||
max_tokens=300,
|
||||
)
|
||||
|
||||
cleaned_response = (
|
||||
str(llm_response.content)
|
||||
.replace("```json\n", "")
|
||||
.replace("\n```", "")
|
||||
.replace("\n", "")
|
||||
)
|
||||
cleaned_response = cleaned_response.split("OBJECTS:")[1]
|
||||
object_list = [x.strip() for x in cleaned_response.split(";")]
|
||||
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error in search_objects: {e}")
|
||||
|
||||
write_custom_event(
|
||||
"initial_agent_answer",
|
||||
AgentAnswerPiece(
|
||||
answer_piece=" Researching the individual objects for each source type... ",
|
||||
level=0,
|
||||
level_question_num=0,
|
||||
answer_type="agent_level_answer",
|
||||
),
|
||||
writer,
|
||||
)
|
||||
|
||||
return SearchSourcesObjectsUpdate(
|
||||
analysis_objects=object_list,
|
||||
analysis_sources=document_sources,
|
||||
log_messages=["Agent 1 Task done"],
|
||||
)
|
||||
@@ -1,185 +0,0 @@
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
from typing import cast
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
from langgraph.types import StreamWriter
|
||||
|
||||
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
|
||||
from onyx.agents.agent_search.dc_search_analysis.ops import research
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import ObjectSourceInput
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import (
|
||||
ObjectSourceResearchUpdate,
|
||||
)
|
||||
from onyx.agents.agent_search.models import GraphConfig
|
||||
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
|
||||
trim_prompt_piece,
|
||||
)
|
||||
from onyx.prompts.agents.dc_prompts import DC_OBJECT_SOURCE_RESEARCH_PROMPT
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import run_with_timeout
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def research_object_source(
|
||||
state: ObjectSourceInput,
|
||||
config: RunnableConfig,
|
||||
writer: StreamWriter = lambda _: None,
|
||||
) -> ObjectSourceResearchUpdate:
|
||||
"""
|
||||
LangGraph node to start the agentic search process.
|
||||
"""
|
||||
datetime.now()
|
||||
|
||||
graph_config = cast(GraphConfig, config["metadata"]["config"])
|
||||
graph_config.inputs.search_request.query
|
||||
search_tool = graph_config.tooling.search_tool
|
||||
question = graph_config.inputs.search_request.query
|
||||
object, document_source = state.object_source_combination
|
||||
|
||||
if search_tool is None or graph_config.inputs.search_request.persona is None:
|
||||
raise ValueError("Search tool and persona must be provided for DivCon search")
|
||||
|
||||
try:
|
||||
instructions = graph_config.inputs.search_request.persona.prompts[
|
||||
0
|
||||
].system_prompt
|
||||
|
||||
agent_2_instructions = extract_section(
|
||||
instructions, "Agent Step 2:", "Agent Step 3:"
|
||||
)
|
||||
if agent_2_instructions is None:
|
||||
raise ValueError("Agent 2 instructions not found")
|
||||
|
||||
agent_2_task = extract_section(
|
||||
agent_2_instructions, "Task:", "Independent Research Sources:"
|
||||
)
|
||||
if agent_2_task is None:
|
||||
raise ValueError("Agent 2 task not found")
|
||||
|
||||
agent_2_time_cutoff = extract_section(
|
||||
agent_2_instructions, "Time Cutoff:", "Research Topics:"
|
||||
)
|
||||
|
||||
agent_2_research_topics = extract_section(
|
||||
agent_2_instructions, "Research Topics:", "Output Objective"
|
||||
)
|
||||
|
||||
agent_2_output_objective = extract_section(
|
||||
agent_2_instructions, "Output Objective:"
|
||||
)
|
||||
if agent_2_output_objective is None:
|
||||
raise ValueError("Agent 2 output objective not found")
|
||||
|
||||
except Exception:
|
||||
raise ValueError(
|
||||
"Agent 1 instructions not found or not formatted correctly: {e}"
|
||||
)
|
||||
|
||||
# Populate prompt
|
||||
|
||||
# Retrieve chunks for objects
|
||||
|
||||
if agent_2_time_cutoff is not None and agent_2_time_cutoff.strip() != "":
|
||||
if agent_2_time_cutoff.strip().endswith("d"):
|
||||
try:
|
||||
days = int(agent_2_time_cutoff.strip()[:-1])
|
||||
agent_2_source_start_time = datetime.now(timezone.utc) - timedelta(
|
||||
days=days
|
||||
)
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Invalid time cutoff format: {agent_2_time_cutoff}. Expected format: '<number>d'"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid time cutoff format: {agent_2_time_cutoff}. Expected format: '<number>d'"
|
||||
)
|
||||
else:
|
||||
agent_2_source_start_time = None
|
||||
|
||||
document_sources = [document_source] if document_source else None
|
||||
|
||||
if len(question.strip()) > 0:
|
||||
research_area = f"{question} for {object}"
|
||||
elif agent_2_research_topics and len(agent_2_research_topics.strip()) > 0:
|
||||
research_area = f"{agent_2_research_topics} for {object}"
|
||||
else:
|
||||
research_area = object
|
||||
|
||||
retrieved_docs = research(
|
||||
question=research_area,
|
||||
search_tool=search_tool,
|
||||
document_sources=document_sources,
|
||||
time_cutoff=agent_2_source_start_time,
|
||||
)
|
||||
|
||||
# Generate document text
|
||||
|
||||
document_texts_list = []
|
||||
for doc_num, doc in enumerate(retrieved_docs):
|
||||
chunk_text = "Document " + str(doc_num) + ":\n" + doc.content
|
||||
document_texts_list.append(chunk_text)
|
||||
|
||||
document_texts = "\n\n".join(document_texts_list)
|
||||
|
||||
# Built prompt
|
||||
|
||||
today = datetime.now().strftime("%A, %Y-%m-%d")
|
||||
|
||||
dc_object_source_research_prompt = (
|
||||
DC_OBJECT_SOURCE_RESEARCH_PROMPT.format(
|
||||
today=today,
|
||||
question=question,
|
||||
task=agent_2_task,
|
||||
document_text=document_texts,
|
||||
format=agent_2_output_objective,
|
||||
)
|
||||
.replace("---object---", object)
|
||||
.replace("---source---", document_source.value)
|
||||
)
|
||||
|
||||
# Run LLM
|
||||
|
||||
msg = [
|
||||
HumanMessage(
|
||||
content=trim_prompt_piece(
|
||||
config=graph_config.tooling.primary_llm.config,
|
||||
prompt_piece=dc_object_source_research_prompt,
|
||||
reserved_str="",
|
||||
),
|
||||
)
|
||||
]
|
||||
# fast_llm = graph_config.tooling.fast_llm
|
||||
primary_llm = graph_config.tooling.primary_llm
|
||||
llm = primary_llm
|
||||
# Grader
|
||||
try:
|
||||
llm_response = run_with_timeout(
|
||||
30,
|
||||
llm.invoke,
|
||||
prompt=msg,
|
||||
timeout_override=30,
|
||||
max_tokens=300,
|
||||
)
|
||||
|
||||
cleaned_response = str(llm_response.content).replace("```json\n", "")
|
||||
cleaned_response = cleaned_response.split("RESEARCH RESULTS:")[1]
|
||||
object_research_results = {
|
||||
"object": object,
|
||||
"source": document_source.value,
|
||||
"research_result": cleaned_response,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error in research_object_source: {e}")
|
||||
|
||||
logger.debug("DivCon Step A2 - Object Source Research - completed for an object")
|
||||
|
||||
return ObjectSourceResearchUpdate(
|
||||
object_source_research_results=[object_research_results],
|
||||
log_messages=["Agent Step 2 done for one object"],
|
||||
)
|
||||
@@ -1,68 +0,0 @@
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from typing import cast
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
from langgraph.types import StreamWriter
|
||||
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import MainState
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import (
|
||||
ObjectResearchInformationUpdate,
|
||||
)
|
||||
from onyx.agents.agent_search.models import GraphConfig
|
||||
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
|
||||
from onyx.chat.models import AgentAnswerPiece
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def structure_research_by_object(
|
||||
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
|
||||
) -> ObjectResearchInformationUpdate:
|
||||
"""
|
||||
LangGraph node to start the agentic search process.
|
||||
"""
|
||||
datetime.now()
|
||||
|
||||
graph_config = cast(GraphConfig, config["metadata"]["config"])
|
||||
graph_config.inputs.search_request.query
|
||||
|
||||
write_custom_event(
|
||||
"initial_agent_answer",
|
||||
AgentAnswerPiece(
|
||||
answer_piece=" consolidating the information across source types for each object...",
|
||||
level=0,
|
||||
level_question_num=0,
|
||||
answer_type="agent_level_answer",
|
||||
),
|
||||
writer,
|
||||
)
|
||||
|
||||
object_source_research_results = state.object_source_research_results
|
||||
|
||||
object_research_information_results: List[Dict[str, str]] = []
|
||||
object_research_information_results_list: Dict[str, List[str]] = defaultdict(list)
|
||||
|
||||
for object_source_research in object_source_research_results:
|
||||
object = object_source_research["object"]
|
||||
source = object_source_research["source"]
|
||||
research_result = object_source_research["research_result"]
|
||||
|
||||
object_research_information_results_list[object].append(
|
||||
f"Source: {source}\n{research_result}"
|
||||
)
|
||||
|
||||
for object, information in object_research_information_results_list.items():
|
||||
object_research_information_results.append(
|
||||
{"object": object, "information": "\n".join(information)}
|
||||
)
|
||||
|
||||
logger.debug("DivCon Step A3 - Object Research Information Structuring - completed")
|
||||
|
||||
return ObjectResearchInformationUpdate(
|
||||
object_research_information_results=object_research_information_results,
|
||||
log_messages=["A3 - Object Research Information structured"],
|
||||
)
|
||||
@@ -1,107 +0,0 @@
|
||||
from typing import cast
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
from langgraph.types import StreamWriter
|
||||
|
||||
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import ObjectInformationInput
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import ObjectResearchUpdate
|
||||
from onyx.agents.agent_search.models import GraphConfig
|
||||
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
|
||||
trim_prompt_piece,
|
||||
)
|
||||
from onyx.prompts.agents.dc_prompts import DC_OBJECT_CONSOLIDATION_PROMPT
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import run_with_timeout
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def consolidate_object_research(
|
||||
state: ObjectInformationInput,
|
||||
config: RunnableConfig,
|
||||
writer: StreamWriter = lambda _: None,
|
||||
) -> ObjectResearchUpdate:
|
||||
"""
|
||||
LangGraph node to start the agentic search process.
|
||||
"""
|
||||
graph_config = cast(GraphConfig, config["metadata"]["config"])
|
||||
graph_config.inputs.search_request.query
|
||||
search_tool = graph_config.tooling.search_tool
|
||||
question = graph_config.inputs.search_request.query
|
||||
|
||||
if search_tool is None or graph_config.inputs.search_request.persona is None:
|
||||
raise ValueError("Search tool and persona must be provided for DivCon search")
|
||||
|
||||
instructions = graph_config.inputs.search_request.persona.prompts[0].system_prompt
|
||||
|
||||
agent_4_instructions = extract_section(
|
||||
instructions, "Agent Step 4:", "Agent Step 5:"
|
||||
)
|
||||
if agent_4_instructions is None:
|
||||
raise ValueError("Agent 4 instructions not found")
|
||||
agent_4_output_objective = extract_section(
|
||||
agent_4_instructions, "Output Objective:"
|
||||
)
|
||||
if agent_4_output_objective is None:
|
||||
raise ValueError("Agent 4 output objective not found")
|
||||
|
||||
object_information = state.object_information
|
||||
|
||||
object = object_information["object"]
|
||||
information = object_information["information"]
|
||||
|
||||
# Create a prompt for the object consolidation
|
||||
|
||||
dc_object_consolidation_prompt = DC_OBJECT_CONSOLIDATION_PROMPT.format(
|
||||
question=question,
|
||||
object=object,
|
||||
information=information,
|
||||
format=agent_4_output_objective,
|
||||
)
|
||||
|
||||
# Run LLM
|
||||
|
||||
msg = [
|
||||
HumanMessage(
|
||||
content=trim_prompt_piece(
|
||||
config=graph_config.tooling.primary_llm.config,
|
||||
prompt_piece=dc_object_consolidation_prompt,
|
||||
reserved_str="",
|
||||
),
|
||||
)
|
||||
]
|
||||
graph_config.tooling.primary_llm
|
||||
# fast_llm = graph_config.tooling.fast_llm
|
||||
primary_llm = graph_config.tooling.primary_llm
|
||||
llm = primary_llm
|
||||
# Grader
|
||||
try:
|
||||
llm_response = run_with_timeout(
|
||||
30,
|
||||
llm.invoke,
|
||||
prompt=msg,
|
||||
timeout_override=30,
|
||||
max_tokens=300,
|
||||
)
|
||||
|
||||
cleaned_response = str(llm_response.content).replace("```json\n", "")
|
||||
consolidated_information = cleaned_response.split("INFORMATION:")[1]
|
||||
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error in consolidate_object_research: {e}")
|
||||
|
||||
object_research_results = {
|
||||
"object": object,
|
||||
"research_result": consolidated_information,
|
||||
}
|
||||
|
||||
logger.debug(
|
||||
"DivCon Step A4 - Object Research Consolidation - completed for an object"
|
||||
)
|
||||
|
||||
return ObjectResearchUpdate(
|
||||
object_research_results=[object_research_results],
|
||||
log_messages=["Agent Source Consilidation done"],
|
||||
)
|
||||
@@ -1,164 +0,0 @@
|
||||
from datetime import datetime
|
||||
from typing import cast
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
from langchain_core.runnables import RunnableConfig
|
||||
from langgraph.types import StreamWriter
|
||||
|
||||
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import MainState
|
||||
from onyx.agents.agent_search.dc_search_analysis.states import ResearchUpdate
|
||||
from onyx.agents.agent_search.models import GraphConfig
|
||||
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
|
||||
trim_prompt_piece,
|
||||
)
|
||||
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
|
||||
from onyx.chat.models import AgentAnswerPiece
|
||||
from onyx.prompts.agents.dc_prompts import DC_FORMATTING_NO_BASE_DATA_PROMPT
|
||||
from onyx.prompts.agents.dc_prompts import DC_FORMATTING_WITH_BASE_DATA_PROMPT
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import run_with_timeout
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def consolidate_research(
|
||||
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
|
||||
) -> ResearchUpdate:
|
||||
"""
|
||||
LangGraph node to start the agentic search process.
|
||||
"""
|
||||
datetime.now()
|
||||
|
||||
graph_config = cast(GraphConfig, config["metadata"]["config"])
|
||||
graph_config.inputs.search_request.query
|
||||
|
||||
search_tool = graph_config.tooling.search_tool
|
||||
|
||||
write_custom_event(
|
||||
"initial_agent_answer",
|
||||
AgentAnswerPiece(
|
||||
answer_piece=" generating the answer\n\n\n",
|
||||
level=0,
|
||||
level_question_num=0,
|
||||
answer_type="agent_level_answer",
|
||||
),
|
||||
writer,
|
||||
)
|
||||
|
||||
if search_tool is None or graph_config.inputs.search_request.persona is None:
|
||||
raise ValueError("Search tool and persona must be provided for DivCon search")
|
||||
|
||||
# Populate prompt
|
||||
instructions = graph_config.inputs.search_request.persona.prompts[0].system_prompt
|
||||
|
||||
try:
|
||||
agent_5_instructions = extract_section(
|
||||
instructions, "Agent Step 5:", "Agent End"
|
||||
)
|
||||
if agent_5_instructions is None:
|
||||
raise ValueError("Agent 5 instructions not found")
|
||||
agent_5_base_data = extract_section(instructions, "|Start Data|", "|End Data|")
|
||||
agent_5_task = extract_section(
|
||||
agent_5_instructions, "Task:", "Independent Research Sources:"
|
||||
)
|
||||
if agent_5_task is None:
|
||||
raise ValueError("Agent 5 task not found")
|
||||
agent_5_output_objective = extract_section(
|
||||
agent_5_instructions, "Output Objective:"
|
||||
)
|
||||
if agent_5_output_objective is None:
|
||||
raise ValueError("Agent 5 output objective not found")
|
||||
except ValueError as e:
|
||||
raise ValueError(
|
||||
f"Instructions for Agent Step 5 were not properly formatted: {e}"
|
||||
)
|
||||
|
||||
research_result_list = []
|
||||
|
||||
if agent_5_task.strip() == "*concatenate*":
|
||||
object_research_results = state.object_research_results
|
||||
|
||||
for object_research_result in object_research_results:
|
||||
object = object_research_result["object"]
|
||||
research_result = object_research_result["research_result"]
|
||||
research_result_list.append(f"Object: {object}\n\n{research_result}")
|
||||
|
||||
research_results = "\n\n".join(research_result_list)
|
||||
|
||||
else:
|
||||
raise NotImplementedError("Only '*concatenate*' is currently supported")
|
||||
|
||||
# Create a prompt for the object consolidation
|
||||
|
||||
if agent_5_base_data is None:
|
||||
dc_formatting_prompt = DC_FORMATTING_NO_BASE_DATA_PROMPT.format(
|
||||
text=research_results,
|
||||
format=agent_5_output_objective,
|
||||
)
|
||||
else:
|
||||
dc_formatting_prompt = DC_FORMATTING_WITH_BASE_DATA_PROMPT.format(
|
||||
base_data=agent_5_base_data,
|
||||
text=research_results,
|
||||
format=agent_5_output_objective,
|
||||
)
|
||||
|
||||
# Run LLM
|
||||
|
||||
msg = [
|
||||
HumanMessage(
|
||||
content=trim_prompt_piece(
|
||||
config=graph_config.tooling.primary_llm.config,
|
||||
prompt_piece=dc_formatting_prompt,
|
||||
reserved_str="",
|
||||
),
|
||||
)
|
||||
]
|
||||
|
||||
dispatch_timings: list[float] = []
|
||||
|
||||
primary_model = graph_config.tooling.primary_llm
|
||||
|
||||
def stream_initial_answer() -> list[str]:
|
||||
response: list[str] = []
|
||||
for message in primary_model.stream(msg, timeout_override=30, max_tokens=None):
|
||||
# TODO: in principle, the answer here COULD contain images, but we don't support that yet
|
||||
content = message.content
|
||||
if not isinstance(content, str):
|
||||
raise ValueError(
|
||||
f"Expected content to be a string, but got {type(content)}"
|
||||
)
|
||||
start_stream_token = datetime.now()
|
||||
|
||||
write_custom_event(
|
||||
"initial_agent_answer",
|
||||
AgentAnswerPiece(
|
||||
answer_piece=content,
|
||||
level=0,
|
||||
level_question_num=0,
|
||||
answer_type="agent_level_answer",
|
||||
),
|
||||
writer,
|
||||
)
|
||||
end_stream_token = datetime.now()
|
||||
dispatch_timings.append(
|
||||
(end_stream_token - start_stream_token).microseconds
|
||||
)
|
||||
response.append(content)
|
||||
return response
|
||||
|
||||
try:
|
||||
_ = run_with_timeout(
|
||||
60,
|
||||
stream_initial_answer,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error in consolidate_research: {e}")
|
||||
|
||||
logger.debug("DivCon Step A5 - Final Generation - completed")
|
||||
|
||||
return ResearchUpdate(
|
||||
research_results=research_results,
|
||||
log_messages=["Agent Source Consilidation done"],
|
||||
)
|
||||
```
@@ -1,61 +0,0 @@
from datetime import datetime
from typing import cast

from onyx.chat.models import LlmDoc
from onyx.configs.constants import DocumentSource
from onyx.context.search.models import InferenceSection
from onyx.db.engine import get_session_with_current_tenant
from onyx.tools.models import SearchToolOverrideKwargs
from onyx.tools.tool_implementations.search.search_tool import (
    FINAL_CONTEXT_DOCUMENTS_ID,
)
from onyx.tools.tool_implementations.search.search_tool import SearchTool


def research(
    question: str,
    search_tool: SearchTool,
    document_sources: list[DocumentSource] | None = None,
    time_cutoff: datetime | None = None,
) -> list[LlmDoc]:
    # new db session to avoid concurrency issues

    callback_container: list[list[InferenceSection]] = []
    retrieved_docs: list[LlmDoc] = []

    with get_session_with_current_tenant() as db_session:
        for tool_response in search_tool.run(
            query=question,
            override_kwargs=SearchToolOverrideKwargs(
                force_no_rerank=False,
                alternate_db_session=db_session,
                retrieved_sections_callback=callback_container.append,
                skip_query_analysis=True,
                document_sources=document_sources,
                time_cutoff=time_cutoff,
            ),
        ):
            # get retrieved docs to send to the rest of the graph
            if tool_response.id == FINAL_CONTEXT_DOCUMENTS_ID:
                retrieved_docs = cast(list[LlmDoc], tool_response.response)[:10]
                break
    return retrieved_docs


def extract_section(
    text: str, start_marker: str, end_marker: str | None = None
) -> str | None:
    """Extract text between markers, returning None if markers not found"""
    parts = text.split(start_marker)

    if len(parts) == 1:
        return None

    after_start = parts[1].strip()

    if not end_marker:
        return after_start

    extract = after_start.split(end_marker)[0]

    return extract.strip()
```
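A quick illustration of how `extract_section` behaves on the kind of persona-prompt markup the DivCon nodes parse; the sample string is invented for the example:

```
instructions = (
    "Agent Step 1: Task: find accounts "
    "Independent Research Sources: SALESFORCE Output Objective: a list"
)

# Everything between the two markers, trimmed:
extract_section(instructions, "Task:", "Independent Research Sources:")
# -> "find accounts"

# With no end marker, everything after the start marker is returned:
extract_section(instructions, "Output Objective:")
# -> "a list"

# A missing start marker yields None:
extract_section(instructions, "Agent Step 2:")
# -> None
```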
```
@@ -1,72 +0,0 @@
from operator import add
from typing import Annotated
from typing import Dict
from typing import TypedDict

from pydantic import BaseModel

from onyx.agents.agent_search.core_state import CoreState
from onyx.agents.agent_search.orchestration.states import ToolCallUpdate
from onyx.agents.agent_search.orchestration.states import ToolChoiceInput
from onyx.agents.agent_search.orchestration.states import ToolChoiceUpdate
from onyx.configs.constants import DocumentSource


### States ###
class LoggerUpdate(BaseModel):
    log_messages: Annotated[list[str], add] = []


class SearchSourcesObjectsUpdate(LoggerUpdate):
    analysis_objects: list[str] = []
    analysis_sources: list[DocumentSource] = []


class ObjectSourceInput(LoggerUpdate):
    object_source_combination: tuple[str, DocumentSource]


class ObjectSourceResearchUpdate(LoggerUpdate):
    object_source_research_results: Annotated[list[Dict[str, str]], add] = []


class ObjectInformationInput(LoggerUpdate):
    object_information: Dict[str, str]


class ObjectResearchInformationUpdate(LoggerUpdate):
    object_research_information_results: Annotated[list[Dict[str, str]], add] = []


class ObjectResearchUpdate(LoggerUpdate):
    object_research_results: Annotated[list[Dict[str, str]], add] = []


class ResearchUpdate(LoggerUpdate):
    research_results: str | None = None


## Graph Input State
class MainInput(CoreState):
    pass


## Graph State
class MainState(
    # This includes the core state
    MainInput,
    ToolChoiceInput,
    ToolCallUpdate,
    ToolChoiceUpdate,
    SearchSourcesObjectsUpdate,
    ObjectSourceResearchUpdate,
    ObjectResearchInformationUpdate,
    ObjectResearchUpdate,
    ResearchUpdate,
):
    pass


## Graph Output State - presently not used
class MainOutput(TypedDict):
    log_messages: list[str]
```
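The `Annotated[list[...], add]` fields above are reducer annotations: when the parallel branches fanned out by `edges.py` each return an update, LangGraph concatenates the lists instead of overwriting them. A small self-contained toy (not the classes above) showing the same mechanism, assuming current LangGraph behavior:

```
from operator import add
from typing import Annotated

from langgraph.graph import END, START, StateGraph
from pydantic import BaseModel


class ToyState(BaseModel):
    # `add` tells LangGraph to merge concurrent updates by list concatenation
    log_messages: Annotated[list[str], add] = []


def branch_a(state: ToyState) -> dict:
    return {"log_messages": ["from branch a"]}


def branch_b(state: ToyState) -> dict:
    return {"log_messages": ["from branch b"]}


builder = StateGraph(ToyState)
builder.add_node("a", branch_a)
builder.add_node("b", branch_b)
builder.add_edge(START, "a")   # both branches run in the same superstep
builder.add_edge(START, "b")
builder.add_edge("a", END)
builder.add_edge("b", END)

result = builder.compile().invoke({"log_messages": []})
print(result["log_messages"])  # both entries survive; order may vary
```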
```
@@ -8,10 +8,6 @@ from langgraph.graph.state import CompiledStateGraph

from onyx.agents.agent_search.basic.graph_builder import basic_graph_builder
from onyx.agents.agent_search.basic.states import BasicInput
from onyx.agents.agent_search.dc_search_analysis.graph_builder import (
    divide_and_conquer_graph_builder,
)
from onyx.agents.agent_search.dc_search_analysis.states import MainInput as DCMainInput
from onyx.agents.agent_search.deep_search.main.graph_builder import (
    main_graph_builder as main_graph_builder_a,
)
@@ -86,7 +82,7 @@ def _parse_agent_event(
def manage_sync_streaming(
    compiled_graph: CompiledStateGraph,
    config: GraphConfig,
    graph_input: BasicInput | MainInput | DCMainInput,
    graph_input: BasicInput | MainInput,
) -> Iterable[StreamEvent]:
    message_id = config.persistence.message_id if config.persistence else None
    for event in compiled_graph.stream(
@@ -100,7 +96,7 @@ def manage_sync_streaming(
def run_graph(
    compiled_graph: CompiledStateGraph,
    config: GraphConfig,
    input: BasicInput | MainInput | DCMainInput,
    input: BasicInput | MainInput,
) -> AnswerStream:
    config.behavior.perform_initial_search_decomposition = (
        INITIAL_SEARCH_DECOMPOSITION_ENABLED
@@ -150,16 +146,6 @@ def run_basic_graph(
    return run_graph(compiled_graph, config, input)


def run_dc_graph(
    config: GraphConfig,
) -> AnswerStream:
    graph = divide_and_conquer_graph_builder()
    compiled_graph = graph.compile()
    input = DCMainInput(log_messages=[])
    config.inputs.search_request.query = config.inputs.search_request.query.strip()
    return run_graph(compiled_graph, config, input)


if __name__ == "__main__":
    for _ in range(1):
        query_start_time = datetime.now()
```
```
@@ -180,35 +180,3 @@ def binary_string_test_after_answer_separator(
    relevant_text = text.split(f"{separator}")[-1]

    return binary_string_test(relevant_text, positive_value)


def build_dc_search_prompt(
    question: str,
    original_question: str,
    docs: list[InferenceSection],
    persona_specification: str,
    config: LLMConfig,
) -> list[SystemMessage | HumanMessage | AIMessage | ToolMessage]:
    system_message = SystemMessage(
        content=persona_specification,
    )

    date_str = build_date_time_string()

    docs_str = format_docs(docs)

    docs_str = trim_prompt_piece(
        config,
        docs_str,
        SUB_QUESTION_RAG_PROMPT + question + original_question + date_str,
    )
    human_message = HumanMessage(
        content=SUB_QUESTION_RAG_PROMPT.format(
            question=question,
            original_question=original_question,
            context=docs_str,
            date_prompt=date_str,
        )
    )

    return [system_message, human_message]
```
```
@@ -10,7 +10,6 @@ from onyx.agents.agent_search.models import GraphPersistence
from onyx.agents.agent_search.models import GraphSearchConfig
from onyx.agents.agent_search.models import GraphTooling
from onyx.agents.agent_search.run_graph import run_basic_graph
from onyx.agents.agent_search.run_graph import run_dc_graph
from onyx.agents.agent_search.run_graph import run_main_graph
from onyx.chat.models import AgentAnswerPiece
from onyx.chat.models import AnswerPacket
@@ -143,18 +142,11 @@ class Answer:
            yield from self._processed_stream
            return

        if self.graph_config.behavior.use_agentic_search:
            run_langgraph = run_main_graph
        elif (
            self.graph_config.inputs.search_request.persona
            and self.graph_config.inputs.search_request.persona.description.startswith(
                "DivCon Beta Agent"
            )
        ):
            run_langgraph = run_dc_graph
        else:
            run_langgraph = run_basic_graph

        run_langgraph = (
            run_main_graph
            if self.graph_config.behavior.use_agentic_search
            else run_basic_graph
        )
        stream = run_langgraph(
            self.graph_config,
        )
```
```
@@ -1,5 +1,4 @@
import base64
import time
from collections.abc import Generator
from datetime import datetime
from datetime import timedelta
@@ -8,8 +7,6 @@ from typing import Any
from typing import cast

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.app_configs import GONG_CONNECTOR_START_TIME
@@ -24,14 +21,13 @@ from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger


logger = setup_logger()

GONG_BASE_URL = "https://us-34014.api.gong.io"


class GongConnector(LoadConnector, PollConnector):
    BASE_URL = "https://api.gong.io"
    MAX_CALL_DETAILS_ATTEMPTS = 6
    CALL_DETAILS_DELAY = 30  # in seconds

    def __init__(
        self,
        workspaces: list[str] | None = None,
```
```
@@ -45,23 +41,15 @@ class GongConnector(LoadConnector, PollConnector):
        self.auth_token_basic: str | None = None
        self.hide_user_info = hide_user_info

        retry_strategy = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 500, 502, 503, 504],
        )
    def _get_auth_header(self) -> dict[str, str]:
        if self.auth_token_basic is None:
            raise ConnectorMissingCredentialError("Gong")

        session = requests.Session()
        session.mount(GongConnector.BASE_URL, HTTPAdapter(max_retries=retry_strategy))
        self._session = session

    @staticmethod
    def make_url(endpoint: str) -> str:
        url = f"{GongConnector.BASE_URL}{endpoint}"
        return url
        return {"Authorization": f"Basic {self.auth_token_basic}"}

    def _get_workspace_id_map(self) -> dict[str, str]:
        response = self._session.get(GongConnector.make_url("/v2/workspaces"))
        url = f"{GONG_BASE_URL}/v2/workspaces"
        response = requests.get(url, headers=self._get_auth_header())
        response.raise_for_status()

        workspaces_details = response.json().get("workspaces")
```
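One side of this hunk replaces per-call `requests.get`/`requests.post` with a session that retries transient failures. A minimal, self-contained sketch of that pattern using the same `Retry` parameters shown above (the header value is a placeholder):

```
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

retry_strategy = Retry(
    total=5,                                    # retry up to 5 times
    backoff_factor=2,                           # exponential backoff between attempts
    status_forcelist=[429, 500, 502, 503, 504],
)

session = requests.Session()
# Every request whose URL starts with this prefix goes through the retrying adapter
session.mount("https://api.gong.io", HTTPAdapter(max_retries=retry_strategy))
session.headers.update({"Authorization": "Basic <base64(key:secret)>"})  # placeholder

response = session.get("https://api.gong.io/v2/workspaces")
response.raise_for_status()
```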
```
@@ -78,6 +66,7 @@ class GongConnector(LoadConnector, PollConnector):
    def _get_transcript_batches(
        self, start_datetime: str | None = None, end_datetime: str | None = None
    ) -> Generator[list[dict[str, Any]], None, None]:
        url = f"{GONG_BASE_URL}/v2/calls/transcript"
        body: dict[str, dict] = {"filter": {}}
        if start_datetime:
            body["filter"]["fromDateTime"] = start_datetime
@@ -105,8 +94,8 @@ class GongConnector(LoadConnector, PollConnector):
                del body["filter"]["workspaceId"]

        while True:
            response = self._session.post(
                GongConnector.make_url("/v2/calls/transcript"), json=body
            response = requests.post(
                url, headers=self._get_auth_header(), json=body
            )
            # If no calls in the range, just break out
            if response.status_code == 404:
```
```
@@ -136,14 +125,14 @@ class GongConnector(LoadConnector, PollConnector):
            yield transcripts

    def _get_call_details_by_ids(self, call_ids: list[str]) -> dict:
        url = f"{GONG_BASE_URL}/v2/calls/extensive"

        body = {
            "filter": {"callIds": call_ids},
            "contentSelector": {"exposedFields": {"parties": True}},
        }

        response = self._session.post(
            GongConnector.make_url("/v2/calls/extensive"), json=body
        )
        response = requests.post(url, headers=self._get_auth_header(), json=body)
        response.raise_for_status()

        calls = response.json().get("calls")
```
@@ -176,74 +165,24 @@ class GongConnector(LoadConnector, PollConnector):
|
||||
def _fetch_calls(
|
||||
self, start_datetime: str | None = None, end_datetime: str | None = None
|
||||
) -> GenerateDocumentsOutput:
|
||||
num_calls = 0
|
||||
|
||||
for transcript_batch in self._get_transcript_batches(
|
||||
start_datetime, end_datetime
|
||||
):
|
||||
doc_batch: list[Document] = []
|
||||
|
||||
transcript_call_ids = cast(
|
||||
call_ids = cast(
|
||||
list[str],
|
||||
[t.get("callId") for t in transcript_batch if t.get("callId")],
|
||||
)
|
||||
call_details_map = self._get_call_details_by_ids(call_ids)
|
||||
|
||||
call_details_map: dict[str, Any] = {}
|
||||
|
||||
# There's a likely race condition in the API where a transcript will have a
|
||||
# call id but the call to v2/calls/extensive will not return all of the id's
|
||||
# retry with exponential backoff has been observed to mitigate this
|
||||
# in ~2 minutes
|
||||
current_attempt = 0
|
||||
while True:
|
||||
current_attempt += 1
|
||||
call_details_map = self._get_call_details_by_ids(transcript_call_ids)
|
||||
if set(transcript_call_ids) == set(call_details_map.keys()):
|
||||
# we got all the id's we were expecting ... break and continue
|
||||
break
|
||||
|
||||
# we are missing some id's. Log and retry with exponential backoff
|
||||
missing_call_ids = set(transcript_call_ids) - set(
|
||||
call_details_map.keys()
|
||||
)
|
||||
logger.warning(
|
||||
f"_get_call_details_by_ids is missing call id's: "
|
||||
f"current_attempt={current_attempt} "
|
||||
f"missing_call_ids={missing_call_ids}"
|
||||
)
|
||||
if current_attempt >= self.MAX_CALL_DETAILS_ATTEMPTS:
|
||||
raise RuntimeError(
|
||||
f"Attempt count exceeded for _get_call_details_by_ids: "
|
||||
f"missing_call_ids={missing_call_ids} "
|
||||
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
|
||||
)
|
||||
|
||||
wait_seconds = self.CALL_DETAILS_DELAY * pow(2, current_attempt - 1)
|
||||
logger.warning(
|
||||
f"_get_call_details_by_ids waiting to retry: "
|
||||
f"wait={wait_seconds}s "
|
||||
f"current_attempt={current_attempt} "
|
||||
f"next_attempt={current_attempt+1} "
|
||||
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
|
||||
)
|
||||
time.sleep(wait_seconds)
|
||||
|
||||
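# Illustrative sketch of the retry-with-exponential-backoff pattern used above for
# fetching call details. Standalone and hedged: `fetch_details`, the attempt limit,
# and the base delay are assumptions for the example, not the connector's real values.
import time


def fetch_with_backoff(
    fetch_details,          # callable returning a dict keyed by call id (assumed)
    expected_ids: set[str],
    max_attempts: int = 5,
    base_delay: float = 5.0,
) -> dict:
    for attempt in range(1, max_attempts + 1):
        details = fetch_details(list(expected_ids))
        missing = expected_ids - set(details.keys())
        if not missing:
            return details
        if attempt == max_attempts:
            raise RuntimeError(f"Still missing ids after {max_attempts} attempts: {missing}")
        # Double the wait on each attempt: base_delay, 2x, 4x, ...
        time.sleep(base_delay * 2 ** (attempt - 1))
    return {}  # unreachable; keeps type checkers satisfied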
# now we can iterate per call/transcript
|
||||
for transcript in transcript_batch:
|
||||
call_id = transcript.get("callId")
|
||||
|
||||
if not call_id or call_id not in call_details_map:
|
||||
# NOTE(rkuo): seeing odd behavior where call_ids from the transcript
|
||||
# don't have call details. adding error debugging logs to trace.
|
||||
logger.error(
|
||||
f"Couldn't get call information for Call ID: {call_id}"
|
||||
)
|
||||
if call_id:
|
||||
logger.error(
|
||||
f"Call debug info: call_id={call_id} "
|
||||
f"call_ids={transcript_call_ids} "
|
||||
f"call_details_map={call_details_map.keys()}"
|
||||
)
|
||||
if not self.continue_on_fail:
|
||||
raise RuntimeError(
|
||||
f"Couldn't get call information for Call ID: {call_id}"
|
||||
@@ -256,8 +195,7 @@ class GongConnector(LoadConnector, PollConnector):
|
||||
call_time_str = call_metadata["started"]
|
||||
call_title = call_metadata["title"]
|
||||
logger.info(
|
||||
f"{num_calls+1}: Indexing Gong call id {call_id} "
|
||||
f"from {call_time_str.split('T', 1)[0]}: {call_title}"
|
||||
f"Indexing Gong call from {call_time_str.split('T', 1)[0]}: {call_title}"
|
||||
)
|
||||
|
||||
call_parties = cast(list[dict] | None, call_details.get("parties"))
|
||||
@@ -316,13 +254,8 @@ class GongConnector(LoadConnector, PollConnector):
|
||||
metadata={"client": call_metadata.get("system")},
|
||||
)
|
||||
)
|
||||
|
||||
num_calls += 1
|
||||
|
||||
yield doc_batch
|
||||
|
||||
logger.info(f"_fetch_calls finished: num_calls={num_calls}")
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
combined = (
|
||||
f'{credentials["gong_access_key"]}:{credentials["gong_access_key_secret"]}'
|
||||
@@ -330,13 +263,6 @@ class GongConnector(LoadConnector, PollConnector):
|
||||
self.auth_token_basic = base64.b64encode(combined.encode("utf-8")).decode(
|
||||
"utf-8"
|
||||
)
|
||||
|
||||
if self.auth_token_basic is None:
|
||||
raise ConnectorMissingCredentialError("Gong")
|
||||
|
||||
self._session.headers.update(
|
||||
{"Authorization": f"Basic {self.auth_token_basic}"}
|
||||
)
|
||||
return None
|
||||
|
||||
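# Minimal sketch of the HTTP Basic credential handling in load_credentials above: the
# access key and secret are joined with ":", base64-encoded, and attached as an
# Authorization header on the shared session. The key values here are placeholders.
import base64

import requests

access_key = "GONG_ACCESS_KEY_PLACEHOLDER"
access_key_secret = "GONG_ACCESS_KEY_SECRET_PLACEHOLDER"

token = base64.b64encode(f"{access_key}:{access_key_secret}".encode("utf-8")).decode("utf-8")

session = requests.Session()
session.headers.update({"Authorization": f"Basic {token}"})
# Subsequent calls such as session.get(...) now carry the Basic auth header automatically.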
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
|
||||
@@ -20,8 +20,7 @@ from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS
|
||||
from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
|
||||
from onyx.file_processing.extract_file_text import ALL_ACCEPTED_FILE_EXTENSIONS
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
@@ -85,21 +84,14 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
Populate the spot ID map with all available spots.
|
||||
Keys are stored as lowercase for case-insensitive lookups.
|
||||
"""
|
||||
try:
|
||||
spots = self.client.get_spots()
|
||||
for spot in spots:
|
||||
if "title" in spot and "id" in spot:
|
||||
spot_name = spot["title"]
|
||||
self._spot_id_map[spot_name.lower()] = spot["id"]
|
||||
spots = self.client.get_spots()
|
||||
for spot in spots:
|
||||
if "title" in spot and "id" in spot:
|
||||
spot_name = spot["title"]
|
||||
self._spot_id_map[spot_name.lower()] = spot["id"]
|
||||
|
||||
self._all_spots_fetched = True
|
||||
logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
|
||||
except HighspotClientError as e:
|
||||
logger.error(f"Error retrieving spots from Highspot: {str(e)}")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error retrieving spots from Highspot: {str(e)}")
|
||||
raise
|
||||
self._all_spots_fetched = True
|
||||
logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
|
||||
|
||||
def _get_all_spot_names(self) -> List[str]:
|
||||
"""
|
||||
@@ -159,142 +151,116 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
Batches of Document objects
|
||||
"""
|
||||
doc_batch: list[Document] = []
|
||||
try:
|
||||
# If no spots specified, get all spots
|
||||
spot_names_to_process = self.spot_names
|
||||
if not spot_names_to_process:
|
||||
spot_names_to_process = self._get_all_spot_names()
|
||||
if not spot_names_to_process:
|
||||
logger.warning("No spots found in Highspot")
|
||||
raise ValueError("No spots found in Highspot")
|
||||
logger.info(
|
||||
f"No spots specified, using all {len(spot_names_to_process)} available spots"
|
||||
)
|
||||
|
||||
for spot_name in spot_names_to_process:
|
||||
try:
|
||||
spot_id = self._get_spot_id_from_name(spot_name)
|
||||
if spot_id is None:
|
||||
logger.warning(f"Spot ID not found for spot {spot_name}")
|
||||
continue
|
||||
offset = 0
|
||||
has_more = True
|
||||
# If no spots specified, get all spots
|
||||
spot_names_to_process = self.spot_names
|
||||
if not spot_names_to_process:
|
||||
spot_names_to_process = self._get_all_spot_names()
|
||||
logger.info(
|
||||
f"No spots specified, using all {len(spot_names_to_process)} available spots"
|
||||
)
|
||||
|
||||
while has_more:
|
||||
logger.info(
|
||||
f"Retrieving items from spot {spot_name}, offset {offset}"
|
||||
)
|
||||
response = self.client.get_spot_items(
|
||||
spot_id=spot_id, offset=offset, page_size=self.batch_size
|
||||
)
|
||||
items = response.get("collection", [])
|
||||
logger.info(f"Received Items: {items}")
|
||||
if not items:
|
||||
has_more = False
|
||||
continue
|
||||
for spot_name in spot_names_to_process:
|
||||
try:
|
||||
spot_id = self._get_spot_id_from_name(spot_name)
|
||||
if spot_id is None:
|
||||
logger.warning(f"Spot ID not found for spot {spot_name}")
|
||||
continue
|
||||
offset = 0
|
||||
has_more = True
|
||||
|
||||
for item in items:
|
||||
try:
|
||||
item_id = item.get("id")
|
||||
if not item_id:
|
||||
logger.warning("Item without ID found, skipping")
|
||||
continue
|
||||
|
||||
item_details = self.client.get_item(item_id)
|
||||
if not item_details:
|
||||
logger.warning(
|
||||
f"Item {item_id} details not found, skipping"
|
||||
)
|
||||
continue
|
||||
# Apply time filter if specified
|
||||
if start or end:
|
||||
updated_at = item_details.get("date_updated")
|
||||
if updated_at:
|
||||
# Convert to datetime for comparison
|
||||
try:
|
||||
updated_time = datetime.fromisoformat(
|
||||
updated_at.replace("Z", "+00:00")
|
||||
)
|
||||
if (
|
||||
start
|
||||
and updated_time.timestamp() < start
|
||||
) or (
|
||||
end and updated_time.timestamp() > end
|
||||
):
|
||||
continue
|
||||
except (ValueError, TypeError):
|
||||
# Skip if date cannot be parsed
|
||||
logger.warning(
|
||||
f"Invalid date format for item {item_id}: {updated_at}"
|
||||
)
|
||||
continue
|
||||
|
||||
content = self._get_item_content(item_details)
|
||||
|
||||
title = item_details.get("title", "")
|
||||
|
||||
doc_batch.append(
|
||||
Document(
|
||||
id=f"HIGHSPOT_{item_id}",
|
||||
sections=[
|
||||
TextSection(
|
||||
link=item_details.get(
|
||||
"url",
|
||||
f"https://www.highspot.com/items/{item_id}",
|
||||
),
|
||||
text=content,
|
||||
)
|
||||
],
|
||||
source=DocumentSource.HIGHSPOT,
|
||||
semantic_identifier=title,
|
||||
metadata={
|
||||
"spot_name": spot_name,
|
||||
"type": item_details.get(
|
||||
"content_type", ""
|
||||
),
|
||||
"created_at": item_details.get(
|
||||
"date_added", ""
|
||||
),
|
||||
"author": item_details.get("author", ""),
|
||||
"language": item_details.get(
|
||||
"language", ""
|
||||
),
|
||||
"can_download": str(
|
||||
item_details.get("can_download", False)
|
||||
),
|
||||
},
|
||||
doc_updated_at=item_details.get("date_updated"),
|
||||
)
|
||||
)
|
||||
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
|
||||
except HighspotClientError as e:
|
||||
item_id = "ID" if not item_id else item_id
|
||||
logger.error(
|
||||
f"Error retrieving item {item_id}: {str(e)}"
|
||||
)
|
||||
except Exception as e:
|
||||
item_id = "ID" if not item_id else item_id
|
||||
logger.error(
|
||||
f"Unexpected error for item {item_id}: {str(e)}"
|
||||
)
|
||||
|
||||
has_more = len(items) >= self.batch_size
|
||||
offset += self.batch_size
|
||||
|
||||
except (HighspotClientError, ValueError) as e:
|
||||
logger.error(f"Error processing spot {spot_name}: {str(e)}")
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Unexpected error processing spot {spot_name}: {str(e)}"
|
||||
while has_more:
|
||||
logger.info(
|
||||
f"Retrieving items from spot {spot_name}, offset {offset}"
|
||||
)
|
||||
response = self.client.get_spot_items(
|
||||
spot_id=spot_id, offset=offset, page_size=self.batch_size
|
||||
)
|
||||
items = response.get("collection", [])
|
||||
logger.info(f"Received Items: {items}")
|
||||
if not items:
|
||||
has_more = False
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Highspot connector: {str(e)}")
|
||||
raise
|
||||
for item in items:
|
||||
try:
|
||||
item_id = item.get("id")
|
||||
if not item_id:
|
||||
logger.warning("Item without ID found, skipping")
|
||||
continue
|
||||
|
||||
item_details = self.client.get_item(item_id)
|
||||
if not item_details:
|
||||
logger.warning(
|
||||
f"Item {item_id} details not found, skipping"
|
||||
)
|
||||
continue
|
||||
# Apply time filter if specified
|
||||
if start or end:
|
||||
updated_at = item_details.get("date_updated")
|
||||
if updated_at:
|
||||
# Convert to datetime for comparison
|
||||
try:
|
||||
updated_time = datetime.fromisoformat(
|
||||
updated_at.replace("Z", "+00:00")
|
||||
)
|
||||
if (
|
||||
start and updated_time.timestamp() < start
|
||||
) or (end and updated_time.timestamp() > end):
|
||||
continue
|
||||
except (ValueError, TypeError):
|
||||
# Skip if date cannot be parsed
|
||||
logger.warning(
|
||||
f"Invalid date format for item {item_id}: {updated_at}"
|
||||
)
|
||||
continue
|
||||
|
||||
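# Sketch of the time-window check above, assuming `updated_at` is an ISO-8601 string
# that may end in "Z" and that `start`/`end` are epoch seconds (or None).
from datetime import datetime


def is_within_window(updated_at: str, start: float | None, end: float | None) -> bool:
    try:
        ts = datetime.fromisoformat(updated_at.replace("Z", "+00:00")).timestamp()
    except (ValueError, TypeError):
        return False  # unparseable dates are skipped, mirroring the warning above
    if start is not None and ts < start:
        return False
    if end is not None and ts > end:
        return False
    return True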
content = self._get_item_content(item_details)
|
||||
title = item_details.get("title", "")
|
||||
|
||||
doc_batch.append(
|
||||
Document(
|
||||
id=f"HIGHSPOT_{item_id}",
|
||||
sections=[
|
||||
TextSection(
|
||||
link=item_details.get(
|
||||
"url",
|
||||
f"https://www.highspot.com/items/{item_id}",
|
||||
),
|
||||
text=content,
|
||||
)
|
||||
],
|
||||
source=DocumentSource.HIGHSPOT,
|
||||
semantic_identifier=title,
|
||||
metadata={
|
||||
"spot_name": spot_name,
|
||||
"type": item_details.get("content_type", ""),
|
||||
"created_at": item_details.get(
|
||||
"date_added", ""
|
||||
),
|
||||
"author": item_details.get("author", ""),
|
||||
"language": item_details.get("language", ""),
|
||||
"can_download": str(
|
||||
item_details.get("can_download", False)
|
||||
),
|
||||
},
|
||||
doc_updated_at=item_details.get("date_updated"),
|
||||
)
|
||||
)
|
||||
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
|
||||
except HighspotClientError as e:
|
||||
item_id = "ID" if not item_id else item_id
|
||||
logger.error(f"Error retrieving item {item_id}: {str(e)}")
|
||||
|
||||
has_more = len(items) >= self.batch_size
|
||||
offset += self.batch_size
|
||||
|
||||
except (HighspotClientError, ValueError) as e:
|
||||
logger.error(f"Error processing spot {spot_name}: {str(e)}")
|
||||
|
||||
if doc_batch:
|
||||
yield doc_batch
|
||||
@@ -320,9 +286,7 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
# Extract title and description once at the beginning
|
||||
title, description = self._extract_title_and_description(item_details)
|
||||
default_content = f"{title}\n{description}"
|
||||
logger.info(
|
||||
f"Processing item {item_id} with extension {file_extension} and file name {content_name}"
|
||||
)
|
||||
logger.info(f"Processing item {item_id} with extension {file_extension}")
|
||||
|
||||
try:
|
||||
if content_type == "WebLink":
|
||||
@@ -334,39 +298,30 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
|
||||
elif (
|
||||
is_valid_format
|
||||
and (
|
||||
file_extension in ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
|
||||
or file_extension in ACCEPTED_DOCUMENT_FILE_EXTENSIONS
|
||||
)
|
||||
and file_extension in ALL_ACCEPTED_FILE_EXTENSIONS
|
||||
and can_download
|
||||
):
|
||||
# For documents, try to get the text content
|
||||
if not item_id: # Ensure item_id is defined
|
||||
return default_content
|
||||
|
||||
content_response = self.client.get_item_content(item_id)
|
||||
# Process and extract text from binary content based on type
|
||||
if content_response:
|
||||
text_content = extract_file_text(
|
||||
BytesIO(content_response), content_name, False
|
||||
BytesIO(content_response), content_name
|
||||
)
|
||||
return text_content if text_content else default_content
|
||||
return text_content
|
||||
return default_content
|
||||
|
||||
else:
|
||||
return default_content
|
||||
|
||||
except HighspotClientError as e:
|
||||
error_context = f"item {item_id}" if item_id else "(item id not found)"
|
||||
# Use item_id safely in the warning message
|
||||
error_context = f"item {item_id}" if item_id else "item"
|
||||
logger.warning(f"Could not retrieve content for {error_context}: {str(e)}")
|
||||
return default_content
|
||||
except ValueError as e:
|
||||
error_context = f"item {item_id}" if item_id else "(item id not found)"
|
||||
logger.error(f"Value error for {error_context}: {str(e)}")
|
||||
return default_content
|
||||
|
||||
except Exception as e:
|
||||
error_context = f"item {item_id}" if item_id else "(item id not found)"
|
||||
logger.error(
|
||||
f"Unexpected error retrieving content for {error_context}: {str(e)}"
|
||||
)
|
||||
return default_content
|
||||
return ""
|
||||
|
||||
def _extract_title_and_description(
|
||||
self, item_details: Dict[str, Any]
|
||||
@@ -403,63 +358,55 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
Batches of SlimDocument objects
|
||||
"""
|
||||
slim_doc_batch: list[SlimDocument] = []
|
||||
try:
|
||||
# If no spots specified, get all spots
|
||||
spot_names_to_process = self.spot_names
|
||||
if not spot_names_to_process:
|
||||
spot_names_to_process = self._get_all_spot_names()
|
||||
if not spot_names_to_process:
|
||||
logger.warning("No spots found in Highspot")
|
||||
raise ValueError("No spots found in Highspot")
|
||||
logger.info(
|
||||
f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
|
||||
)
|
||||
|
||||
for spot_name in spot_names_to_process:
|
||||
try:
|
||||
spot_id = self._get_spot_id_from_name(spot_name)
|
||||
offset = 0
|
||||
has_more = True
|
||||
# If no spots specified, get all spots
|
||||
spot_names_to_process = self.spot_names
|
||||
if not spot_names_to_process:
|
||||
spot_names_to_process = self._get_all_spot_names()
|
||||
logger.info(
|
||||
f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
|
||||
)
|
||||
|
||||
while has_more:
|
||||
logger.info(
|
||||
f"Retrieving slim documents from spot {spot_name}, offset {offset}"
|
||||
)
|
||||
response = self.client.get_spot_items(
|
||||
spot_id=spot_id, offset=offset, page_size=self.batch_size
|
||||
)
|
||||
for spot_name in spot_names_to_process:
|
||||
try:
|
||||
spot_id = self._get_spot_id_from_name(spot_name)
|
||||
offset = 0
|
||||
has_more = True
|
||||
|
||||
items = response.get("collection", [])
|
||||
if not items:
|
||||
has_more = False
|
||||
continue
|
||||
|
||||
for item in items:
|
||||
item_id = item.get("id")
|
||||
if not item_id:
|
||||
continue
|
||||
|
||||
slim_doc_batch.append(
|
||||
SlimDocument(id=f"HIGHSPOT_{item_id}")
|
||||
)
|
||||
|
||||
if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
|
||||
yield slim_doc_batch
|
||||
slim_doc_batch = []
|
||||
|
||||
has_more = len(items) >= self.batch_size
|
||||
offset += self.batch_size
|
||||
|
||||
except (HighspotClientError, ValueError) as e:
|
||||
logger.error(
|
||||
f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
|
||||
while has_more:
|
||||
logger.info(
|
||||
f"Retrieving slim documents from spot {spot_name}, offset {offset}"
|
||||
)
|
||||
response = self.client.get_spot_items(
|
||||
spot_id=spot_id, offset=offset, page_size=self.batch_size
|
||||
)
|
||||
|
||||
if slim_doc_batch:
|
||||
yield slim_doc_batch
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Highspot Slim Connector: {str(e)}")
|
||||
raise
|
||||
items = response.get("collection", [])
|
||||
if not items:
|
||||
has_more = False
|
||||
continue
|
||||
|
||||
for item in items:
|
||||
item_id = item.get("id")
|
||||
if not item_id:
|
||||
continue
|
||||
|
||||
slim_doc_batch.append(SlimDocument(id=f"HIGHSPOT_{item_id}"))
|
||||
|
||||
if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
|
||||
yield slim_doc_batch
|
||||
slim_doc_batch = []
|
||||
|
||||
has_more = len(items) >= self.batch_size
|
||||
offset += self.batch_size
|
||||
|
||||
except (HighspotClientError, ValueError) as e:
|
||||
logger.error(
|
||||
f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
|
||||
)
|
||||
|
||||
if slim_doc_batch:
|
||||
yield slim_doc_batch
|
||||
|
||||
def validate_credentials(self) -> bool:
|
||||
"""
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
@@ -41,9 +40,6 @@ class TextSection(Section):
|
||||
text: str
|
||||
link: str | None = None
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
return sys.getsizeof(self.text) + sys.getsizeof(self.link)
|
||||
|
||||
|
||||
class ImageSection(Section):
|
||||
"""Section containing an image reference"""
|
||||
@@ -51,9 +47,6 @@ class ImageSection(Section):
|
||||
image_file_name: str
|
||||
link: str | None = None
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
return sys.getsizeof(self.image_file_name) + sys.getsizeof(self.link)
|
||||
|
||||
|
||||
class BasicExpertInfo(BaseModel):
|
||||
"""Basic Information for the owner of a document, any of the fields can be left as None
|
||||
@@ -117,14 +110,6 @@ class BasicExpertInfo(BaseModel):
|
||||
)
|
||||
)
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
size = sys.getsizeof(self.display_name)
|
||||
size += sys.getsizeof(self.first_name)
|
||||
size += sys.getsizeof(self.middle_initial)
|
||||
size += sys.getsizeof(self.last_name)
|
||||
size += sys.getsizeof(self.email)
|
||||
return size
|
||||
|
||||
|
||||
class DocumentBase(BaseModel):
|
||||
"""Used for Onyx ingestion api, the ID is inferred before use if not provided"""
|
||||
@@ -178,32 +163,6 @@ class DocumentBase(BaseModel):
|
||||
attributes.append(k + INDEX_SEPARATOR + v)
|
||||
return attributes
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
size = sys.getsizeof(self.id)
|
||||
for section in self.sections:
|
||||
size += sys.getsizeof(section)
|
||||
size += sys.getsizeof(self.source)
|
||||
size += sys.getsizeof(self.semantic_identifier)
|
||||
size += sys.getsizeof(self.doc_updated_at)
|
||||
size += sys.getsizeof(self.chunk_count)
|
||||
|
||||
if self.primary_owners is not None:
|
||||
for primary_owner in self.primary_owners:
|
||||
size += sys.getsizeof(primary_owner)
|
||||
else:
|
||||
size += sys.getsizeof(self.primary_owners)
|
||||
|
||||
if self.secondary_owners is not None:
|
||||
for secondary_owner in self.secondary_owners:
|
||||
size += sys.getsizeof(secondary_owner)
|
||||
else:
|
||||
size += sys.getsizeof(self.secondary_owners)
|
||||
|
||||
size += sys.getsizeof(self.title)
|
||||
size += sys.getsizeof(self.from_ingestion_api)
|
||||
size += sys.getsizeof(self.additional_info)
|
||||
return size
|
||||
|
||||
def get_text_content(self) -> str:
|
||||
return " ".join([section.text for section in self.sections if section.text])
|
||||
|
||||
@@ -235,12 +194,6 @@ class Document(DocumentBase):
|
||||
from_ingestion_api=base.from_ingestion_api,
|
||||
)
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
size = super().__sizeof__()
|
||||
size += sys.getsizeof(self.id)
|
||||
size += sys.getsizeof(self.source)
|
||||
return size
|
||||
|
||||
|
||||
class IndexingDocument(Document):
|
||||
"""Document with processed sections for indexing"""
|
||||
|
||||
@@ -1,9 +1,4 @@
|
||||
import gc
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from simple_salesforce import Salesforce
|
||||
@@ -26,13 +21,9 @@ from onyx.connectors.salesforce.salesforce_calls import get_all_children_of_sf_t
|
||||
from onyx.connectors.salesforce.sqlite_functions import get_affected_parent_ids_by_type
|
||||
from onyx.connectors.salesforce.sqlite_functions import get_record
|
||||
from onyx.connectors.salesforce.sqlite_functions import init_db
|
||||
from onyx.connectors.salesforce.sqlite_functions import sqlite_log_stats
|
||||
from onyx.connectors.salesforce.sqlite_functions import update_sf_db_with_csv
|
||||
from onyx.connectors.salesforce.utils import BASE_DATA_PATH
|
||||
from onyx.connectors.salesforce.utils import get_sqlite_db_path
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -41,8 +32,6 @@ _DEFAULT_PARENT_OBJECT_TYPES = ["Account"]
|
||||
|
||||
|
||||
class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
MAX_BATCH_BYTES = 1024 * 1024
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
@@ -75,45 +64,22 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
raise ConnectorMissingCredentialError("Salesforce")
|
||||
return self._sf_client
|
||||
|
||||
@staticmethod
|
||||
def reconstruct_object_types(directory: str) -> dict[str, list[str] | None]:
|
||||
"""
|
||||
Scans the given directory for all CSV files and reconstructs the available object types.
|
||||
Assumes filenames are formatted as "ObjectType.filename.csv" or "ObjectType.csv".
|
||||
|
||||
Args:
|
||||
directory (str): The path to the directory containing CSV files.
|
||||
|
||||
Returns:
|
||||
dict[str, list[str]]: A dictionary mapping object types to lists of file paths.
|
||||
"""
|
||||
object_types = defaultdict(list)
|
||||
|
||||
for filename in os.listdir(directory):
|
||||
if filename.endswith(".csv"):
|
||||
parts = filename.split(".", 1) # Split on the first period
|
||||
object_type = parts[0] # Take the first part as the object type
|
||||
object_types[object_type].append(os.path.join(directory, filename))
|
||||
|
||||
return dict(object_types)
|
||||
|
||||
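# Hedged sketch of the filename-grouping convention documented above: the prefix before
# the first "." is treated as the Salesforce object type. The file names and directory
# are made up for illustration.
import os
from collections import defaultdict

filenames = ["Account.part1.csv", "Account.part2.csv", "Contact.csv"]  # hypothetical
object_types: dict[str, list[str]] = defaultdict(list)
for filename in filenames:
    if filename.endswith(".csv"):
        object_type = filename.split(".", 1)[0]
        object_types[object_type].append(os.path.join("/tmp/sf_csvs", filename))
# object_types == {"Account": [two paths], "Contact": ["/tmp/sf_csvs/Contact.csv"]}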
@staticmethod
|
||||
def _download_object_csvs(
|
||||
directory: str,
|
||||
parent_object_list: list[str],
|
||||
sf_client: Salesforce,
|
||||
def _fetch_from_salesforce(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> None:
|
||||
all_object_types: set[str] = set(parent_object_list)
|
||||
) -> GenerateDocumentsOutput:
|
||||
init_db()
|
||||
all_object_types: set[str] = set(self.parent_object_list)
|
||||
|
||||
logger.info(
|
||||
f"Parent object types: num={len(parent_object_list)} list={parent_object_list}"
|
||||
)
|
||||
logger.info(f"Starting with {len(self.parent_object_list)} parent object types")
|
||||
logger.debug(f"Parent object types: {self.parent_object_list}")
|
||||
|
||||
# This takes like 20 seconds
|
||||
for parent_object_type in parent_object_list:
|
||||
child_types = get_all_children_of_sf_type(sf_client, parent_object_type)
|
||||
for parent_object_type in self.parent_object_list:
|
||||
child_types = get_all_children_of_sf_type(
|
||||
self.sf_client, parent_object_type
|
||||
)
|
||||
all_object_types.update(child_types)
|
||||
logger.debug(
|
||||
f"Found {len(child_types)} child types for {parent_object_type}"
|
||||
@@ -122,53 +88,20 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
# Always want to make sure user is grabbed for permissioning purposes
|
||||
all_object_types.add("User")
|
||||
|
||||
logger.info(
|
||||
f"All object types: num={len(all_object_types)} list={all_object_types}"
|
||||
)
|
||||
|
||||
# gc.collect()
|
||||
logger.info(f"Found total of {len(all_object_types)} object types to fetch")
|
||||
logger.debug(f"All object types: {all_object_types}")
|
||||
|
||||
# checkpoint - we've found all object types, now time to fetch the data
|
||||
logger.info("Fetching CSVs for all object types")
|
||||
|
||||
logger.info("Starting to fetch CSVs for all object types")
|
||||
# This takes like 30 minutes first time and <2 minutes for updates
|
||||
object_type_to_csv_path = fetch_all_csvs_in_parallel(
|
||||
sf_client=sf_client,
|
||||
sf_client=self.sf_client,
|
||||
object_types=all_object_types,
|
||||
start=start,
|
||||
end=end,
|
||||
target_dir=directory,
|
||||
)
|
||||
|
||||
# print useful information
|
||||
num_csvs = 0
|
||||
num_bytes = 0
|
||||
for object_type, csv_paths in object_type_to_csv_path.items():
|
||||
if not csv_paths:
|
||||
continue
|
||||
|
||||
for csv_path in csv_paths:
|
||||
if not csv_path:
|
||||
continue
|
||||
|
||||
file_path = Path(csv_path)
|
||||
file_size = file_path.stat().st_size
|
||||
num_csvs += 1
|
||||
num_bytes += file_size
|
||||
logger.info(
|
||||
f"CSV info: object_type={object_type} path={csv_path} bytes={file_size}"
|
||||
)
|
||||
|
||||
logger.info(f"CSV info total: total_csvs={num_csvs} total_bytes={num_bytes}")
|
||||
|
||||
@staticmethod
|
||||
def _load_csvs_to_db(csv_directory: str, db_directory: str) -> set[str]:
|
||||
updated_ids: set[str] = set()
|
||||
|
||||
object_type_to_csv_path = SalesforceConnector.reconstruct_object_types(
|
||||
csv_directory
|
||||
)
|
||||
|
||||
# This takes like 10 seconds
|
||||
# This is for testing the rest of the functionality if data has
|
||||
# already been fetched and put in sqlite
|
||||
@@ -187,16 +120,10 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
# If path is None, it means it failed to fetch the csv
|
||||
if csv_paths is None:
|
||||
continue
|
||||
|
||||
# Go through each csv path and use it to update the db
|
||||
for csv_path in csv_paths:
|
||||
logger.debug(
|
||||
f"Processing CSV: object_type={object_type} "
|
||||
f"csv={csv_path} "
|
||||
f"len={Path(csv_path).stat().st_size}"
|
||||
)
|
||||
logger.debug(f"Updating {object_type} with {csv_path}")
|
||||
new_ids = update_sf_db_with_csv(
|
||||
db_directory,
|
||||
object_type=object_type,
|
||||
csv_download_path=csv_path,
|
||||
)
|
||||
@@ -205,127 +132,49 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
f"Added {len(new_ids)} new/updated records for {object_type}"
|
||||
)
|
||||
|
||||
os.remove(csv_path)
|
||||
|
||||
return updated_ids
|
||||
|
||||
def _fetch_from_salesforce(
|
||||
self,
|
||||
temp_dir: str,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> GenerateDocumentsOutput:
|
||||
logger.info("_fetch_from_salesforce starting.")
|
||||
if not self._sf_client:
|
||||
raise RuntimeError("self._sf_client is None!")
|
||||
|
||||
init_db(temp_dir)
|
||||
|
||||
sqlite_log_stats(temp_dir)
|
||||
|
||||
# Step 1 - download
|
||||
SalesforceConnector._download_object_csvs(
|
||||
temp_dir, self.parent_object_list, self._sf_client, start, end
|
||||
)
|
||||
gc.collect()
|
||||
|
||||
# Step 2 - load CSV's to sqlite
|
||||
updated_ids = SalesforceConnector._load_csvs_to_db(temp_dir, temp_dir)
|
||||
gc.collect()
|
||||
|
||||
logger.info(f"Found {len(updated_ids)} total updated records")
|
||||
logger.info(
|
||||
f"Starting to process parent objects of types: {self.parent_object_list}"
|
||||
)
|
||||
|
||||
# Step 3 - extract and index docs
|
||||
batches_processed = 0
|
||||
docs_processed = 0
|
||||
docs_to_yield: list[Document] = []
|
||||
docs_to_yield_bytes = 0
|
||||
|
||||
docs_processed = 0
|
||||
# Takes 15-20 seconds per batch
|
||||
for parent_type, parent_id_batch in get_affected_parent_ids_by_type(
|
||||
temp_dir,
|
||||
updated_ids=list(updated_ids),
|
||||
parent_types=self.parent_object_list,
|
||||
):
|
||||
batches_processed += 1
|
||||
logger.info(
|
||||
f"Processing batch: index={batches_processed} "
|
||||
f"object_type={parent_type} "
|
||||
f"len={len(parent_id_batch)} "
|
||||
f"processed={docs_processed} "
|
||||
f"remaining={len(updated_ids) - docs_processed}"
|
||||
f"Processing batch of {len(parent_id_batch)} {parent_type} objects"
|
||||
)
|
||||
for parent_id in parent_id_batch:
|
||||
if not (parent_object := get_record(temp_dir, parent_id, parent_type)):
|
||||
if not (parent_object := get_record(parent_id, parent_type)):
|
||||
logger.warning(
|
||||
f"Failed to get parent object {parent_id} for {parent_type}"
|
||||
)
|
||||
continue
|
||||
|
||||
doc = convert_sf_object_to_doc(
|
||||
temp_dir,
|
||||
sf_object=parent_object,
|
||||
sf_instance=self.sf_client.sf_instance,
|
||||
docs_to_yield.append(
|
||||
convert_sf_object_to_doc(
|
||||
sf_object=parent_object,
|
||||
sf_instance=self.sf_client.sf_instance,
|
||||
)
|
||||
)
|
||||
doc_sizeof = sys.getsizeof(doc)
|
||||
docs_to_yield_bytes += doc_sizeof
|
||||
docs_to_yield.append(doc)
|
||||
docs_processed += 1
|
||||
|
||||
# memory usage is sensitive to the input length, so we're yielding immediately
|
||||
# if the batch exceeds a certain byte length
|
||||
if (
|
||||
len(docs_to_yield) >= self.batch_size
|
||||
or docs_to_yield_bytes > SalesforceConnector.MAX_BATCH_BYTES
|
||||
):
|
||||
if len(docs_to_yield) >= self.batch_size:
|
||||
yield docs_to_yield
|
||||
docs_to_yield = []
|
||||
docs_to_yield_bytes = 0
|
||||
|
||||
# observed a memory leak / size issue with the account table if we don't gc.collect here.
|
||||
gc.collect()
|
||||
|
||||
yield docs_to_yield
|
||||
logger.info(
|
||||
f"Final processing stats: "
|
||||
f"processed={docs_processed} "
|
||||
f"remaining={len(updated_ids) - docs_processed}"
|
||||
)
|
||||
|
||||
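# Generic sketch of the batching policy used in _fetch_from_salesforce above: yield a
# batch when it reaches a count limit or when its approximate in-memory size crosses a
# byte budget. The 1 MiB cap mirrors MAX_BATCH_BYTES; everything else is illustrative.
import sys
from collections.abc import Iterable, Iterator


def batch_by_count_or_bytes(
    items: Iterable, max_items: int = 100, max_bytes: int = 1024 * 1024
) -> Iterator[list]:
    batch: list = []
    batch_bytes = 0
    for item in items:
        batch.append(item)
        batch_bytes += sys.getsizeof(item)
        if len(batch) >= max_items or batch_bytes > max_bytes:
            yield batch
            batch = []
            batch_bytes = 0
    if batch:
        yield batch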
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
if MULTI_TENANT:
|
||||
# if multi tenant, we cannot expect the sqlite db to be cached/present
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
return self._fetch_from_salesforce(temp_dir)
|
||||
|
||||
# nuke the db since we're starting from scratch
|
||||
sqlite_db_path = get_sqlite_db_path(BASE_DATA_PATH)
|
||||
if os.path.exists(sqlite_db_path):
|
||||
logger.info(f"load_from_state: Removing db at {sqlite_db_path}.")
|
||||
os.remove(sqlite_db_path)
|
||||
return self._fetch_from_salesforce(BASE_DATA_PATH)
|
||||
return self._fetch_from_salesforce()
|
||||
|
||||
def poll_source(
|
||||
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
||||
) -> GenerateDocumentsOutput:
|
||||
if MULTI_TENANT:
|
||||
# if multi tenant, we cannot expect the sqlite db to be cached/present
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
return self._fetch_from_salesforce(temp_dir, start=start, end=end)
|
||||
|
||||
if start == 0:
|
||||
# nuke the db if we're starting from scratch
|
||||
sqlite_db_path = get_sqlite_db_path(BASE_DATA_PATH)
|
||||
if os.path.exists(sqlite_db_path):
|
||||
logger.info(
|
||||
f"poll_source: Starting at time 0, removing db at {sqlite_db_path}."
|
||||
)
|
||||
os.remove(sqlite_db_path)
|
||||
|
||||
return self._fetch_from_salesforce(BASE_DATA_PATH)
|
||||
return self._fetch_from_salesforce(start=start, end=end)
|
||||
|
||||
def retrieve_all_slim_documents(
|
||||
self,
|
||||
@@ -360,7 +209,7 @@ if __name__ == "__main__":
|
||||
"sf_security_token": os.environ["SF_SECURITY_TOKEN"],
|
||||
}
|
||||
)
|
||||
start_time = time.monotonic()
|
||||
start_time = time.time()
|
||||
doc_count = 0
|
||||
section_count = 0
|
||||
text_count = 0
|
||||
@@ -372,7 +221,7 @@ if __name__ == "__main__":
|
||||
for section in doc.sections:
|
||||
if isinstance(section, TextSection) and section.text is not None:
|
||||
text_count += len(section.text)
|
||||
end_time = time.monotonic()
|
||||
end_time = time.time()
|
||||
|
||||
print(f"Doc count: {doc_count}")
|
||||
print(f"Section count: {section_count}")
|
||||
|
||||
@@ -124,14 +124,13 @@ def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> Text
|
||||
|
||||
|
||||
def _extract_primary_owners(
|
||||
directory: str,
|
||||
sf_object: SalesforceObject,
|
||||
) -> list[BasicExpertInfo] | None:
|
||||
object_dict = sf_object.data
|
||||
if not (last_modified_by_id := object_dict.get("LastModifiedById")):
|
||||
logger.warning(f"No LastModifiedById found for {sf_object.id}")
|
||||
return None
|
||||
if not (last_modified_by := get_record(directory, last_modified_by_id)):
|
||||
if not (last_modified_by := get_record(last_modified_by_id)):
|
||||
logger.warning(f"No LastModifiedBy found for {last_modified_by_id}")
|
||||
return None
|
||||
|
||||
@@ -160,7 +159,6 @@ def _extract_primary_owners(
|
||||
|
||||
|
||||
def convert_sf_object_to_doc(
|
||||
directory: str,
|
||||
sf_object: SalesforceObject,
|
||||
sf_instance: str,
|
||||
) -> Document:
|
||||
@@ -172,8 +170,8 @@ def convert_sf_object_to_doc(
|
||||
extracted_semantic_identifier = object_dict.get("Name", "Unknown Object")
|
||||
|
||||
sections = [_extract_section(sf_object, base_url)]
|
||||
for id in get_child_ids(directory, sf_object.id):
|
||||
if not (child_object := get_record(directory, id)):
|
||||
for id in get_child_ids(sf_object.id):
|
||||
if not (child_object := get_record(id)):
|
||||
continue
|
||||
sections.append(_extract_section(child_object, base_url))
|
||||
|
||||
@@ -183,7 +181,7 @@ def convert_sf_object_to_doc(
|
||||
source=DocumentSource.SALESFORCE,
|
||||
semantic_identifier=extracted_semantic_identifier,
|
||||
doc_updated_at=extracted_doc_updated_at,
|
||||
primary_owners=_extract_primary_owners(directory, sf_object),
|
||||
primary_owners=_extract_primary_owners(sf_object),
|
||||
metadata={},
|
||||
)
|
||||
return doc
|
||||
|
||||
@@ -11,12 +11,13 @@ from simple_salesforce.bulk2 import SFBulk2Type
|
||||
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.salesforce.sqlite_functions import has_at_least_one_object_of_type
|
||||
from onyx.connectors.salesforce.utils import get_object_type_path
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _build_last_modified_time_filter_for_salesforce(
|
||||
def _build_time_filter_for_salesforce(
|
||||
start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
|
||||
) -> str:
|
||||
if start is None or end is None:
|
||||
@@ -29,19 +30,6 @@ def _build_last_modified_time_filter_for_salesforce(
|
||||
)
|
||||
|
||||
|
||||
def _build_created_date_time_filter_for_salesforce(
|
||||
start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
|
||||
) -> str:
|
||||
if start is None or end is None:
|
||||
return ""
|
||||
start_datetime = datetime.fromtimestamp(start, UTC)
|
||||
end_datetime = datetime.fromtimestamp(end, UTC)
|
||||
return (
|
||||
f" WHERE CreatedDate > {start_datetime.isoformat()} "
|
||||
f"AND CreatedDate < {end_datetime.isoformat()}"
|
||||
)
|
||||
|
||||
|
||||
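# Hedged sketch of the SOQL time filters built above: given epoch bounds, the helpers
# emit a WHERE clause on LastModifiedDate (or CreatedDate for the few object types
# without it). The timestamps and fields below are arbitrary examples.
from datetime import datetime, timezone

start = datetime.fromtimestamp(1_700_000_000, timezone.utc)
end = datetime.fromtimestamp(1_700_086_400, timezone.utc)
time_filter = (
    f" WHERE LastModifiedDate > {start.isoformat()} "
    f"AND LastModifiedDate < {end.isoformat()}"
)
query = f"SELECT Id, Name FROM Account{time_filter}"
# -> SELECT Id, Name FROM Account WHERE LastModifiedDate > 2023-11-14T22:13:20+00:00 AND ...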
def _get_sf_type_object_json(sf_client: Salesforce, type_name: str) -> Any:
|
||||
sf_object = SFType(type_name, sf_client.session_id, sf_client.sf_instance)
|
||||
return sf_object.describe()
|
||||
@@ -121,6 +109,23 @@ def _check_if_object_type_is_empty(
|
||||
return True
|
||||
|
||||
|
||||
def _check_for_existing_csvs(sf_type: str) -> list[str] | None:
|
||||
# Check if the csv already exists
|
||||
if os.path.exists(get_object_type_path(sf_type)):
|
||||
existing_csvs = [
|
||||
os.path.join(get_object_type_path(sf_type), f)
|
||||
for f in os.listdir(get_object_type_path(sf_type))
|
||||
if f.endswith(".csv")
|
||||
]
|
||||
# If the csv already exists, return the path
|
||||
# This is likely due to a previous run that failed
|
||||
# after downloading the csv but before the data was
|
||||
# written to the db
|
||||
if existing_csvs:
|
||||
return existing_csvs
|
||||
return None
|
||||
|
||||
|
||||
def _build_bulk_query(sf_client: Salesforce, sf_type: str, time_filter: str) -> str:
|
||||
queryable_fields = _get_all_queryable_fields_of_sf_type(sf_client, sf_type)
|
||||
query = f"SELECT {', '.join(queryable_fields)} FROM {sf_type}{time_filter}"
|
||||
@@ -128,15 +133,16 @@ def _build_bulk_query(sf_client: Salesforce, sf_type: str, time_filter: str) ->
|
||||
|
||||
|
||||
def _bulk_retrieve_from_salesforce(
|
||||
sf_client: Salesforce, sf_type: str, time_filter: str, target_dir: str
|
||||
sf_client: Salesforce,
|
||||
sf_type: str,
|
||||
time_filter: str,
|
||||
) -> tuple[str, list[str] | None]:
|
||||
"""Returns a tuple of
|
||||
1. the salesforce object type
|
||||
2. the list of CSV's
|
||||
"""
|
||||
if not _check_if_object_type_is_empty(sf_client, sf_type, time_filter):
|
||||
return sf_type, None
|
||||
|
||||
if existing_csvs := _check_for_existing_csvs(sf_type):
|
||||
return sf_type, existing_csvs
|
||||
|
||||
query = _build_bulk_query(sf_client, sf_type, time_filter)
|
||||
|
||||
bulk_2_handler = SFBulk2Handler(
|
||||
@@ -153,33 +159,20 @@ def _bulk_retrieve_from_salesforce(
|
||||
)
|
||||
|
||||
logger.info(f"Downloading {sf_type}")
|
||||
|
||||
logger.debug(f"Query: {query}")
|
||||
logger.info(f"Query: {query}")
|
||||
|
||||
try:
|
||||
# This downloads the file to a file in the target path with a random name
|
||||
results = bulk_2_type.download(
|
||||
query=query,
|
||||
path=target_dir,
|
||||
path=get_object_type_path(sf_type),
|
||||
max_records=1000000,
|
||||
)
|
||||
|
||||
# prepend each downloaded csv with the object type (delimiter = '.')
|
||||
all_download_paths: list[str] = []
|
||||
for result in results:
|
||||
original_file_path = result["file"]
|
||||
directory, filename = os.path.split(original_file_path)
|
||||
new_filename = f"{sf_type}.{filename}"
|
||||
new_file_path = os.path.join(directory, new_filename)
|
||||
os.rename(original_file_path, new_file_path)
|
||||
all_download_paths.append(new_file_path)
|
||||
all_download_paths = [result["file"] for result in results]
|
||||
logger.info(f"Downloaded {sf_type} to {all_download_paths}")
|
||||
return sf_type, all_download_paths
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to download salesforce csv for object type {sf_type}: {e}"
|
||||
)
|
||||
logger.warning(f"Exceptioning query for object type {sf_type}: {query}")
|
||||
logger.info(f"Failed to download salesforce csv for object type {sf_type}: {e}")
|
||||
return sf_type, None
|
||||
|
||||
|
||||
@@ -188,35 +181,12 @@ def fetch_all_csvs_in_parallel(
|
||||
object_types: set[str],
|
||||
start: SecondsSinceUnixEpoch | None,
|
||||
end: SecondsSinceUnixEpoch | None,
|
||||
target_dir: str,
|
||||
) -> dict[str, list[str] | None]:
|
||||
"""
|
||||
Fetches all the csvs in parallel for the given object types
|
||||
Returns a dict of (sf_type, full_download_path)
|
||||
"""
|
||||
|
||||
# these types don't query properly and need looking at
|
||||
# problem_types: set[str] = {
|
||||
# "ContentDocumentLink",
|
||||
# "RecordActionHistory",
|
||||
# "PendingOrderSummary",
|
||||
# "UnifiedActivityRelation",
|
||||
# }
|
||||
|
||||
# these types don't have a LastModifiedDate field and instead use CreatedDate
|
||||
created_date_types: set[str] = {
|
||||
"AccountHistory",
|
||||
"AccountTag",
|
||||
"EntitySubscription",
|
||||
}
|
||||
|
||||
last_modified_time_filter = _build_last_modified_time_filter_for_salesforce(
|
||||
start, end
|
||||
)
|
||||
created_date_time_filter = _build_created_date_time_filter_for_salesforce(
|
||||
start, end
|
||||
)
|
||||
|
||||
time_filter = _build_time_filter_for_salesforce(start, end)
|
||||
time_filter_for_each_object_type = {}
|
||||
# We do this outside of the thread pool executor because this requires
|
||||
# a database connection and we don't want to block the thread pool
|
||||
@@ -225,11 +195,8 @@ def fetch_all_csvs_in_parallel(
|
||||
"""Only add time filter if there is at least one object of the type
|
||||
in the database. We aren't worried about partially completed object update runs
|
||||
because this occurs after we check for existing csvs which covers this case"""
|
||||
if has_at_least_one_object_of_type(target_dir, sf_type):
|
||||
if sf_type in created_date_types:
|
||||
time_filter_for_each_object_type[sf_type] = created_date_time_filter
|
||||
else:
|
||||
time_filter_for_each_object_type[sf_type] = last_modified_time_filter
|
||||
if has_at_least_one_object_of_type(sf_type):
|
||||
time_filter_for_each_object_type[sf_type] = time_filter
|
||||
else:
|
||||
time_filter_for_each_object_type[sf_type] = ""
|
||||
|
||||
@@ -240,7 +207,6 @@ def fetch_all_csvs_in_parallel(
|
||||
sf_client=sf_client,
|
||||
sf_type=object_type,
|
||||
time_filter=time_filter_for_each_object_type[object_type],
|
||||
target_dir=target_dir,
|
||||
),
|
||||
object_types,
|
||||
)
|
||||
|
||||
@@ -2,10 +2,8 @@ import csv
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import time
|
||||
from collections.abc import Iterator
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from onyx.connectors.salesforce.utils import get_sqlite_db_path
|
||||
from onyx.connectors.salesforce.utils import SalesforceObject
|
||||
@@ -18,7 +16,6 @@ logger = setup_logger()
|
||||
|
||||
@contextmanager
|
||||
def get_db_connection(
|
||||
directory: str,
|
||||
isolation_level: str | None = None,
|
||||
) -> Iterator[sqlite3.Connection]:
|
||||
"""Get a database connection with proper isolation level and error handling.
|
||||
@@ -28,7 +25,7 @@ def get_db_connection(
|
||||
can be "IMMEDIATE" or "EXCLUSIVE" for more strict isolation.
|
||||
"""
|
||||
# 60 second timeout for locks
|
||||
conn = sqlite3.connect(get_sqlite_db_path(directory), timeout=60.0)
|
||||
conn = sqlite3.connect(get_sqlite_db_path(), timeout=60.0)
|
||||
|
||||
if isolation_level is not None:
|
||||
conn.isolation_level = isolation_level
|
||||
@@ -41,41 +38,17 @@ def get_db_connection(
|
||||
conn.close()
|
||||
|
||||
|
||||
def sqlite_log_stats(directory: str) -> None:
|
||||
with get_db_connection(directory, "EXCLUSIVE") as conn:
|
||||
cache_pages = conn.execute("PRAGMA cache_size").fetchone()[0]
|
||||
page_size = conn.execute("PRAGMA page_size").fetchone()[0]
|
||||
if cache_pages >= 0:
|
||||
cache_bytes = cache_pages * page_size
|
||||
else:
|
||||
cache_bytes = abs(cache_pages * 1024)
|
||||
logger.info(
|
||||
f"SQLite stats: sqlite_version={sqlite3.sqlite_version} "
|
||||
f"cache_pages={cache_pages} "
|
||||
f"page_size={page_size} "
|
||||
f"cache_bytes={cache_bytes}"
|
||||
)
|
||||
|
||||
|
||||
def init_db(directory: str) -> None:
|
||||
def init_db() -> None:
|
||||
"""Initialize the SQLite database with required tables if they don't exist."""
|
||||
# Create database directory if it doesn't exist
|
||||
start = time.monotonic()
|
||||
os.makedirs(os.path.dirname(get_sqlite_db_path()), exist_ok=True)
|
||||
|
||||
os.makedirs(os.path.dirname(get_sqlite_db_path(directory)), exist_ok=True)
|
||||
|
||||
with get_db_connection(directory, "EXCLUSIVE") as conn:
|
||||
with get_db_connection("EXCLUSIVE") as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
db_exists = os.path.exists(get_sqlite_db_path(directory))
|
||||
|
||||
if db_exists:
|
||||
file_path = Path(get_sqlite_db_path(directory))
|
||||
file_size = file_path.stat().st_size
|
||||
logger.info(f"init_db - found existing sqlite db: len={file_size}")
|
||||
else:
|
||||
# why is this only if the db doesn't exist?
|
||||
db_exists = os.path.exists(get_sqlite_db_path())
|
||||
|
||||
if not db_exists:
|
||||
# Enable WAL mode for better concurrent access and write performance
|
||||
cursor.execute("PRAGMA journal_mode=WAL")
|
||||
cursor.execute("PRAGMA synchronous=NORMAL")
|
||||
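# Minimal sketch of the SQLite setup above: WAL journaling plus a relaxed synchronous
# level for better concurrent read/write behavior, wrapped in a context manager. The
# database path is a placeholder, not the connector's real location.
import sqlite3
from collections.abc import Iterator
from contextlib import contextmanager


@contextmanager
def wal_connection(db_path: str = "/tmp/example.sqlite") -> Iterator[sqlite3.Connection]:
    conn = sqlite3.connect(db_path, timeout=60.0)  # 60 second lock timeout
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        yield conn
    finally:
        conn.close()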
@@ -170,31 +143,16 @@ def init_db(directory: str) -> None:
|
||||
""",
|
||||
)
|
||||
|
||||
elapsed = time.monotonic() - start
|
||||
logger.info(f"init_db - create tables and indices: elapsed={elapsed:.2f}")
|
||||
|
||||
# Analyze tables to help query planner
|
||||
# NOTE(rkuo): skip ANALYZE - it takes too long and we likely don't have
|
||||
# complicated queries that need this
|
||||
# start = time.monotonic()
|
||||
# cursor.execute("ANALYZE relationships")
|
||||
# cursor.execute("ANALYZE salesforce_objects")
|
||||
# cursor.execute("ANALYZE relationship_types")
|
||||
# cursor.execute("ANALYZE user_email_map")
|
||||
# elapsed = time.monotonic() - start
|
||||
# logger.info(f"init_db - analyze: elapsed={elapsed:.2f}")
|
||||
cursor.execute("ANALYZE relationships")
|
||||
cursor.execute("ANALYZE salesforce_objects")
|
||||
cursor.execute("ANALYZE relationship_types")
|
||||
cursor.execute("ANALYZE user_email_map")
|
||||
|
||||
# If database already existed but user_email_map needs to be populated
|
||||
start = time.monotonic()
|
||||
cursor.execute("SELECT COUNT(*) FROM user_email_map")
|
||||
elapsed = time.monotonic() - start
|
||||
logger.info(f"init_db - count user_email_map: elapsed={elapsed:.2f}")
|
||||
|
||||
start = time.monotonic()
|
||||
if cursor.fetchone()[0] == 0:
|
||||
_update_user_email_map(conn)
|
||||
elapsed = time.monotonic() - start
|
||||
logger.info(f"init_db - update_user_email_map: elapsed={elapsed:.2f}")
|
||||
|
||||
conn.commit()
|
||||
|
||||
@@ -282,15 +240,15 @@ def _update_user_email_map(conn: sqlite3.Connection) -> None:
|
||||
|
||||
|
||||
def update_sf_db_with_csv(
|
||||
directory: str,
|
||||
object_type: str,
|
||||
csv_download_path: str,
|
||||
delete_csv_after_use: bool = True,
|
||||
) -> list[str]:
|
||||
"""Update the SF DB with a CSV file using SQLite storage."""
|
||||
updated_ids = []
|
||||
|
||||
# Use IMMEDIATE to get a write lock at the start of the transaction
|
||||
with get_db_connection(directory, "IMMEDIATE") as conn:
|
||||
with get_db_connection("IMMEDIATE") as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
with open(csv_download_path, "r", newline="", encoding="utf-8") as f:
|
||||
@@ -337,12 +295,17 @@ def update_sf_db_with_csv(
|
||||
|
||||
conn.commit()
|
||||
|
||||
if delete_csv_after_use:
|
||||
# Remove the csv file after it has been used
|
||||
# to successfully update the db
|
||||
os.remove(csv_download_path)
|
||||
|
||||
return updated_ids
|
||||
|
||||
|
||||
def get_child_ids(directory: str, parent_id: str) -> set[str]:
|
||||
def get_child_ids(parent_id: str) -> set[str]:
|
||||
"""Get all child IDs for a given parent ID."""
|
||||
with get_db_connection(directory) as conn:
|
||||
with get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Force index usage with INDEXED BY
|
||||
@@ -354,9 +317,9 @@ def get_child_ids(directory: str, parent_id: str) -> set[str]:
|
||||
return child_ids
|
||||
|
||||
|
||||
def get_type_from_id(directory: str, object_id: str) -> str | None:
|
||||
def get_type_from_id(object_id: str) -> str | None:
|
||||
"""Get the type of an object from its ID."""
|
||||
with get_db_connection(directory) as conn:
|
||||
with get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT object_type FROM salesforce_objects WHERE id = ?", (object_id,)
|
||||
@@ -369,15 +332,15 @@ def get_type_from_id(directory: str, object_id: str) -> str | None:
|
||||
|
||||
|
||||
def get_record(
|
||||
directory: str, object_id: str, object_type: str | None = None
|
||||
object_id: str, object_type: str | None = None
|
||||
) -> SalesforceObject | None:
|
||||
"""Retrieve the record and return it as a SalesforceObject."""
|
||||
if object_type is None:
|
||||
object_type = get_type_from_id(directory, object_id)
|
||||
object_type = get_type_from_id(object_id)
|
||||
if not object_type:
|
||||
return None
|
||||
|
||||
with get_db_connection(directory) as conn:
|
||||
with get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT data FROM salesforce_objects WHERE id = ?", (object_id,))
|
||||
result = cursor.fetchone()
|
||||
@@ -389,9 +352,9 @@ def get_record(
|
||||
return SalesforceObject(id=object_id, type=object_type, data=data)
|
||||
|
||||
|
||||
def find_ids_by_type(directory: str, object_type: str) -> list[str]:
|
||||
def find_ids_by_type(object_type: str) -> list[str]:
|
||||
"""Find all object IDs for rows of the specified type."""
|
||||
with get_db_connection(directory) as conn:
|
||||
with get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT id FROM salesforce_objects WHERE object_type = ?", (object_type,)
|
||||
@@ -400,7 +363,6 @@ def find_ids_by_type(directory: str, object_type: str) -> list[str]:
|
||||
|
||||
|
||||
def get_affected_parent_ids_by_type(
|
||||
directory: str,
|
||||
updated_ids: list[str],
|
||||
parent_types: list[str],
|
||||
batch_size: int = 500,
|
||||
@@ -412,7 +374,7 @@ def get_affected_parent_ids_by_type(
|
||||
updated_ids_batches = batch_list(updated_ids, batch_size)
|
||||
updated_parent_ids: set[str] = set()
|
||||
|
||||
with get_db_connection(directory) as conn:
|
||||
with get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
for batch_ids in updated_ids_batches:
|
||||
@@ -457,7 +419,7 @@ def get_affected_parent_ids_by_type(
|
||||
yield parent_type, new_affected_ids
|
||||
|
||||
|
||||
def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
|
||||
def has_at_least_one_object_of_type(object_type: str) -> bool:
|
||||
"""Check if there is at least one object of the specified type in the database.
|
||||
|
||||
Args:
|
||||
@@ -466,7 +428,7 @@ def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
|
||||
Returns:
|
||||
bool: True if at least one object exists, False otherwise
|
||||
"""
|
||||
with get_db_connection(directory) as conn:
|
||||
with get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT COUNT(*) FROM salesforce_objects WHERE object_type = ?",
|
||||
@@ -481,7 +443,7 @@ def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
|
||||
NULL_ID_STRING = "N/A"
|
||||
|
||||
|
||||
def get_user_id_by_email(directory: str, email: str) -> str | None:
|
||||
def get_user_id_by_email(email: str) -> str | None:
|
||||
"""Get the Salesforce User ID for a given email address.
|
||||
|
||||
Args:
|
||||
@@ -492,7 +454,7 @@ def get_user_id_by_email(directory: str, email: str) -> str | None:
|
||||
- was_found: True if the email exists in the table, False if not found
|
||||
- user_id: The Salesforce User ID if exists, None otherwise
|
||||
"""
|
||||
with get_db_connection(directory) as conn:
|
||||
with get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT user_id FROM user_email_map WHERE email = ?", (email,))
|
||||
result = cursor.fetchone()
|
||||
@@ -501,10 +463,10 @@ def get_user_id_by_email(directory: str, email: str) -> str | None:
|
||||
return result[0]
|
||||
|
||||
|
||||
def update_email_to_id_table(directory: str, email: str, id: str | None) -> None:
|
||||
def update_email_to_id_table(email: str, id: str | None) -> None:
|
||||
"""Update the email to ID map table with a new email and ID."""
|
||||
id_to_use = id or NULL_ID_STRING
|
||||
with get_db_connection(directory) as conn:
|
||||
with get_db_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"INSERT OR REPLACE INTO user_email_map (email, user_id) VALUES (?, ?)",
|
||||
|
||||
@@ -30,9 +30,9 @@ class SalesforceObject:
|
||||
BASE_DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
|
||||
|
||||
|
||||
def get_sqlite_db_path(directory: str) -> str:
|
||||
def get_sqlite_db_path() -> str:
|
||||
"""Get the path to the sqlite db file."""
|
||||
return os.path.join(directory, "salesforce_db.sqlite")
|
||||
return os.path.join(BASE_DATA_PATH, "salesforce_db.sqlite")
|
||||
|
||||
|
||||
def get_object_type_path(object_type: str) -> str:
|
||||
|
||||
@@ -255,9 +255,7 @@ _DISALLOWED_MSG_SUBTYPES = {
|
||||
def default_msg_filter(message: MessageType) -> bool:
|
||||
# Don't keep messages from bots
|
||||
if message.get("bot_id") or message.get("app_id"):
|
||||
bot_profile_name = message.get("bot_profile", {}).get("name")
|
||||
print(f"bot_profile_name: {bot_profile_name}")
|
||||
if bot_profile_name == "DanswerBot Testing":
|
||||
if message.get("bot_profile", {}).get("name") == "OnyxConnector":
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@@ -227,13 +227,16 @@ class SearchPipeline:
|
||||
|
||||
# If ee is enabled, censor the chunk sections based on user access
|
||||
# Otherwise, return the retrieved chunks
|
||||
censored_chunks: list[InferenceChunk] = fetch_ee_implementation_or_noop(
|
||||
"onyx.external_permissions.post_query_censoring",
|
||||
"_post_query_chunk_censoring",
|
||||
retrieved_chunks,
|
||||
)(
|
||||
chunks=retrieved_chunks,
|
||||
user=self.user,
|
||||
censored_chunks = cast(
|
||||
list[InferenceChunk],
|
||||
fetch_ee_implementation_or_noop(
|
||||
"onyx.external_permissions.post_query_censoring",
|
||||
"_post_query_chunk_censoring",
|
||||
retrieved_chunks,
|
||||
)(
|
||||
chunks=retrieved_chunks,
|
||||
user=self.user,
|
||||
),
|
||||
)
|
||||
|
||||
above = self.search_query.chunks_above
|
||||
|
||||
@@ -2,7 +2,6 @@ import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
import zipfile
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
@@ -15,7 +14,6 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
from typing import NamedTuple
|
||||
from typing import Optional
|
||||
|
||||
import chardet
|
||||
import docx # type: ignore
|
||||
@@ -570,8 +568,8 @@ def extract_text_and_images(


def convert_docx_to_txt(
file: UploadFile, file_store: FileStore, file_path: Optional[str] = None
) -> str:
file: UploadFile, file_store: FileStore, file_path: str
) -> None:
"""
Helper to convert docx to a .txt file in the same filestore.
"""
@@ -583,41 +581,15 @@ def convert_docx_to_txt(
all_paras = [p.text for p in doc.paragraphs]
text_content = "\n".join(all_paras)

file_name = file.filename or f"docx_{uuid.uuid4()}"
text_file_name = docx_to_txt_filename(file_path if file_path else file_name)
txt_file_path = docx_to_txt_filename(file_path)
file_store.save_file(
file_name=text_file_name,
file_name=txt_file_path,
content=BytesIO(text_content.encode("utf-8")),
display_name=file.filename,
file_origin=FileOrigin.CONNECTOR,
file_type="text/plain",
)
return text_file_name


def docx_to_txt_filename(file_path: str) -> str:
return file_path.rsplit(".", 1)[0] + ".txt"


def convert_pdf_to_txt(file: UploadFile, file_store: FileStore, file_path: str) -> str:
"""
Helper to convert PDF to a .txt file in the same filestore.
"""
file.file.seek(0)

# Extract text from the PDF
text_content, _, _ = read_pdf_file(file.file)

text_file_name = pdf_to_txt_filename(file_path)
file_store.save_file(
file_name=text_file_name,
content=BytesIO(text_content.encode("utf-8")),
display_name=file.filename,
file_origin=FileOrigin.CONNECTOR,
file_type="text/plain",
)
return text_file_name


def pdf_to_txt_filename(file_path: str) -> str:
return file_path.rsplit(".", 1)[0] + ".txt"

@@ -459,6 +459,10 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
llm = get_default_llm_with_vision()

if not llm:
logger.warning(
"No vision-capable LLM available. Image sections will not be processed."
)

# Even without LLM, we still convert to IndexingDocument with base Sections
return [
IndexingDocument(
@@ -925,12 +929,10 @@ def index_doc_batch(
for chunk_num, chunk in enumerate(chunks_with_embeddings)
]

short_descriptor_list = [
chunk.to_short_descriptor() for chunk in access_aware_chunks
]
short_descriptor_log = str(short_descriptor_list)[:1024]
logger.debug(f"Indexing the following chunks: {short_descriptor_log}")

logger.debug(
"Indexing the following chunks: "
f"{[chunk.to_short_descriptor() for chunk in access_aware_chunks]}"
)
# A document will not be spread across different batches, so all the
# documents with chunks in this set, are fully represented by the chunks
# in this set

@@ -175,7 +175,7 @@ class EmbeddingModel:
embeddings: list[Embedding] = []

def process_batch(
batch_idx: int, batch_len: int, text_batch: list[str]
batch_idx: int, text_batch: list[str]
) -> tuple[int, list[Embedding]]:
if self.callback:
if self.callback.should_stop():
@@ -202,8 +202,8 @@ class EmbeddingModel:
end_time = time.time()

processing_time = end_time - start_time
logger.debug(
f"EmbeddingModel.process_batch: Batch {batch_idx}/{batch_len} processing time: {processing_time:.2f} seconds"
logger.info(
f"Batch {batch_idx} processing time: {processing_time:.2f} seconds"
)

return batch_idx, response.embeddings
@@ -215,7 +215,7 @@ class EmbeddingModel:
if num_threads >= 1 and self.provider_type and len(text_batches) > 1:
with ThreadPoolExecutor(max_workers=num_threads) as executor:
future_to_batch = {
executor.submit(process_batch, idx, len(text_batches), batch): idx
executor.submit(process_batch, idx, batch): idx
for idx, batch in enumerate(text_batches, start=1)
}

@@ -238,7 +238,7 @@ class EmbeddingModel:
else:
# Original sequential processing
for idx, text_batch in enumerate(text_batches, start=1):
_, batch_embeddings = process_batch(idx, len(text_batches), text_batch)
_, batch_embeddings = process_batch(idx, text_batch)
embeddings.extend(batch_embeddings)
if self.callback:
self.callback.progress("_batch_encode_texts", 1)

@@ -1,147 +0,0 @@
|
||||
# Standards
|
||||
SEPARATOR_LINE = "-------"
|
||||
SEPARATOR_LINE_LONG = "---------------"
|
||||
NO_EXTRACTION = "No extraction of knowledge graph objects was feasable."
|
||||
YES = "yes"
|
||||
NO = "no"
|
||||
DC_OBJECT_SEPARATOR = ";"
|
||||
|
||||
|
||||
DC_OBJECT_NO_BASE_DATA_EXTRACTION_PROMPT = f"""
|
||||
You are an expert in finding relevant objects/objext specifications of the same type in a list of documents. \
|
||||
In this case you are interested \
|
||||
in generating: {{objects_of_interest}}.
|
||||
You should look at the documents - in no particular order! - and extract each object you find in the documents.
|
||||
{SEPARATOR_LINE}
|
||||
Here are the documents you are supposed to search through:
|
||||
--
|
||||
{{document_text}}
|
||||
{SEPARATOR_LINE}
|
||||
Here are the task instructions you should use to help you find the desired objects:
|
||||
{SEPARATOR_LINE}
|
||||
{{task}}
|
||||
{SEPARATOR_LINE}
|
||||
Here is the question that may provide critical additional context for the task:
|
||||
{SEPARATOR_LINE}
|
||||
{{question}}
|
||||
{SEPARATOR_LINE}
|
||||
Please answer the question in the following format:
|
||||
REASONING: <your reasoning for the classification> - OBJECTS: <the objects - just their names - that you found, \
|
||||
separated by ';'>
|
||||
""".strip()
|
||||
|
||||
|
||||
DC_OBJECT_WITH_BASE_DATA_EXTRACTION_PROMPT = f"""
|
||||
You are an expert in finding relevant objects/object specifications of the same type in a list of documents. \
|
||||
In this case you are interested \
|
||||
in generating: {{objects_of_interest}}.
|
||||
You should look at the provided data - in no particular order! - and extract each object you find in the documents.
|
||||
{SEPARATOR_LINE}
|
||||
Here are the data provided by the user:
|
||||
--
|
||||
{{base_data}}
|
||||
{SEPARATOR_LINE}
|
||||
Here are the task instructions you should use to help you find the desired objects:
|
||||
{SEPARATOR_LINE}
|
||||
{{task}}
|
||||
{SEPARATOR_LINE}
|
||||
Here is the request that may provide critical additional context for the task:
|
||||
{SEPARATOR_LINE}
|
||||
{{question}}
|
||||
{SEPARATOR_LINE}
|
||||
Please address the request in the following format:
|
||||
REASONING: <your reasoning for the classification> - OBJECTS: <the objects - just their names - that you found, \
|
||||
separated by ';'>
|
||||
""".strip()
|
||||
|
||||
|
||||
DC_OBJECT_SOURCE_RESEARCH_PROMPT = f"""
|
||||
Today is {{today}}. You are an expert in extracting relevant structured information from a list of documents that \
|
||||
should relate to one object. (Try to make sure that you know it relates to that one object!).
|
||||
You should look at the documents - in no particular order! - and extract the information asked for this task:
|
||||
{SEPARATOR_LINE}
|
||||
{{task}}
|
||||
{SEPARATOR_LINE}
|
||||
|
||||
Here is the user question that may provide critical additional context for the task:
|
||||
{SEPARATOR_LINE}
|
||||
{{question}}
|
||||
{SEPARATOR_LINE}
|
||||
|
||||
Here are the documents you are supposed to search through:
|
||||
--
|
||||
{{document_text}}
|
||||
{SEPARATOR_LINE}
|
||||
Note: please cite your sources inline as you generate the results! Use the format [1], etc. Infer the \
|
||||
number from the provided context documents. This is very important!
|
||||
Please address the task in the following format:
|
||||
REASONING:
|
||||
-- <your reasoning for the classification>
|
||||
RESEARCH RESULTS:
|
||||
{{format}}
|
||||
""".strip()
|
||||
|
||||
|
||||
DC_OBJECT_CONSOLIDATION_PROMPT = f"""
|
||||
You are a helpful assistant that consolidates information about a specific object \
|
||||
from multiple sources.
|
||||
The object is:
|
||||
{SEPARATOR_LINE}
|
||||
{{object}}
|
||||
{SEPARATOR_LINE}
|
||||
and the information is
|
||||
{SEPARATOR_LINE}
|
||||
{{information}}
|
||||
{SEPARATOR_LINE}
|
||||
Here is the user question that may provide critical additional context for the task:
|
||||
{SEPARATOR_LINE}
|
||||
{{question}}
|
||||
{SEPARATOR_LINE}
|
||||
|
||||
Please consolidate the information into a single, concise answer. The consolidated informtation \
|
||||
for the object should be in the following format:
|
||||
{SEPARATOR_LINE}
|
||||
{{format}}
|
||||
{SEPARATOR_LINE}
|
||||
Overall, please use this structure to communicate the consolidated information:
|
||||
{SEPARATOR_LINE}
|
||||
REASONING: <your reasoning for consolidating the information>
|
||||
INFORMATION:
|
||||
<consolidated information in the proper format that you have created>
|
||||
"""
|
||||
|
||||
|
||||
DC_FORMATTING_NO_BASE_DATA_PROMPT = f"""
|
||||
You are an expert in text formatting. Your task is to take a given text and convert it 100 percent accurately \
|
||||
in a new format.
|
||||
Here is the text you are supposed to format:
|
||||
{SEPARATOR_LINE}
|
||||
{{text}}
|
||||
{SEPARATOR_LINE}
|
||||
Here is the format you are supposed to use:
|
||||
{SEPARATOR_LINE}
|
||||
{{format}}
|
||||
{SEPARATOR_LINE}
|
||||
Please start the generation directly with the formatted text. (Note that the output should not be code, but text.)
|
||||
"""
|
||||
|
||||
DC_FORMATTING_WITH_BASE_DATA_PROMPT = f"""
|
||||
You are an expert in text formatting. Your task is to take a given text and the initial \
|
||||
base data provided by the user, and convert it 100 percent accurately \
|
||||
in a new format. The base data may also contain important relationships that are critical \
|
||||
for the formatting.
|
||||
Here is the initial data provided by the user:
|
||||
{SEPARATOR_LINE}
|
||||
{{base_data}}
|
||||
{SEPARATOR_LINE}
|
||||
Here is the text you are supposed combine (and format) with the initial data, adhering to the \
|
||||
format instructions provided by later in the prompt:
|
||||
{SEPARATOR_LINE}
|
||||
{{text}}
|
||||
{SEPARATOR_LINE}
|
||||
And here are the format instructions you are supposed to use:
|
||||
{SEPARATOR_LINE}
|
||||
{{format}}
|
||||
{SEPARATOR_LINE}
|
||||
Please start the generation directly with the formatted text. (Note that the output should not be code, but text.)
|
||||
"""
|
||||
@@ -100,7 +100,6 @@ from onyx.db.models import UserGroup__ConnectorCredentialPair
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.file_processing.extract_file_text import convert_docx_to_txt
from onyx.file_processing.extract_file_text import convert_pdf_to_txt
from onyx.file_store.file_store import get_default_file_store
from onyx.key_value_store.interface import KvKeyNotFoundError
from onyx.redis.redis_connector import RedisConnector
@@ -129,7 +128,6 @@ from onyx.utils.telemetry import create_milestone_and_report
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop


logger = setup_logger()

_GMAIL_CREDENTIAL_ID_COOKIE_NAME = "gmail_credential_id"
@@ -432,23 +430,6 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp
)
continue

# Special handling for docx files - only store the plaintext version
if file.content_type and file.content_type.startswith(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename))
text_file_path = convert_docx_to_txt(file, file_store)
deduped_file_paths.append(text_file_path)
continue

# Special handling for PDF files - only store the plaintext version
if file.content_type and file.content_type.startswith("application/pdf"):
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename))
text_file_path = convert_pdf_to_txt(file, file_store, file_path)
deduped_file_paths.append(text_file_path)
continue

# Default handling for all other file types
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename))
deduped_file_paths.append(file_path)
file_store.save_file(
@@ -459,6 +440,11 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp
file_type=file.content_type or "text/plain",
)

if file.content_type and file.content_type.startswith(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
convert_docx_to_txt(file, file_store, file_path)

except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
return FileUploadResponse(file_paths=deduped_file_paths)

@@ -1,5 +1,4 @@
from collections.abc import Callable
from datetime import datetime
from typing import Any
from uuid import UUID

@@ -7,7 +6,6 @@ from pydantic import BaseModel
from pydantic import model_validator
from sqlalchemy.orm import Session

from onyx.configs.constants import DocumentSource
from onyx.context.search.enums import SearchType
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceSection
@@ -77,8 +75,6 @@ class SearchToolOverrideKwargs(BaseModel):
ordering_only: bool | None = (
None # Flag for fast path when search is only needed for ordering
)
document_sources: list[DocumentSource] | None = None
time_cutoff: datetime | None = None

class Config:
arbitrary_types_allowed = True

@@ -292,8 +292,6 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
user_file_ids = None
user_folder_ids = None
ordering_only = False
document_sources = None
time_cutoff = None
if override_kwargs:
force_no_rerank = use_alt_not_None(override_kwargs.force_no_rerank, False)
alternate_db_session = override_kwargs.alternate_db_session
@@ -304,8 +302,6 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
user_file_ids = override_kwargs.user_file_ids
user_folder_ids = override_kwargs.user_folder_ids
ordering_only = use_alt_not_None(override_kwargs.ordering_only, False)
document_sources = override_kwargs.document_sources
time_cutoff = override_kwargs.time_cutoff

# Fast path for ordering-only search
if ordering_only:
@@ -338,23 +334,6 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
)
retrieval_options = RetrievalDetails(filters=filters)

if document_sources or time_cutoff:
# Get retrieval_options and filters, or create if they don't exist
retrieval_options = retrieval_options or RetrievalDetails()
retrieval_options.filters = retrieval_options.filters or BaseFilters()

# Handle document sources
if document_sources:
source_types = retrieval_options.filters.source_type or []
retrieval_options.filters.source_type = list(
set(source_types + document_sources)
)

# Handle time cutoff
if time_cutoff:
# Overwrite time-cutoff should supercede existing time-cutoff, even if defined
retrieval_options.filters.time_cutoff = time_cutoff

search_pipeline = SearchPipeline(
search_request=SearchRequest(
query=query,

@@ -1,44 +0,0 @@
import os
import time
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest

from onyx.connectors.gong.connector import GongConnector
from onyx.connectors.models import Document


@pytest.fixture
def gong_connector() -> GongConnector:
connector = GongConnector()

connector.load_credentials(
{
"gong_access_key": os.environ["GONG_ACCESS_KEY"],
"gong_access_key_secret": os.environ["GONG_ACCESS_KEY_SECRET"],
}
)

return connector


@patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_gong_basic(mock_get_api_key: MagicMock, gong_connector: GongConnector) -> None:
doc_batch_generator = gong_connector.poll_source(0, time.time())

doc_batch = next(doc_batch_generator)
with pytest.raises(StopIteration):
next(doc_batch_generator)

assert len(doc_batch) == 2

docs: list[Document] = []
for doc in doc_batch:
docs.append(doc)

assert docs[0].semantic_identifier == "test with chris"
assert docs[1].semantic_identifier == "Testing Gong"
@@ -1,7 +1,6 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
@@ -106,54 +105,6 @@ def test_highspot_connector_slim(
|
||||
assert len(all_slim_doc_ids) > 0
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_highspot_connector_poll_source(
|
||||
mock_get_api_key: MagicMock, highspot_connector: HighspotConnector
|
||||
) -> None:
|
||||
"""Test poll_source functionality with date range filtering."""
|
||||
# Define date range: April 3, 2025 to April 4, 2025
|
||||
start_date = datetime(2025, 4, 3, 0, 0, 0)
|
||||
end_date = datetime(2025, 4, 4, 23, 59, 59)
|
||||
|
||||
# Convert to seconds since Unix epoch
|
||||
start_time = int(time.mktime(start_date.timetuple()))
|
||||
end_time = int(time.mktime(end_date.timetuple()))
|
||||
|
||||
# Load test data for assertions
|
||||
test_data = load_test_data()
|
||||
poll_source_data = test_data.get("poll_source", {})
|
||||
target_doc_id = poll_source_data.get("target_doc_id")
|
||||
|
||||
# Call poll_source with date range
|
||||
all_docs: list[Document] = []
|
||||
target_doc: Document | None = None
|
||||
|
||||
for doc_batch in highspot_connector.poll_source(start_time, end_time):
|
||||
for doc in doc_batch:
|
||||
all_docs.append(doc)
|
||||
if doc.id == f"HIGHSPOT_{target_doc_id}":
|
||||
target_doc = doc
|
||||
|
||||
# Verify documents were loaded
|
||||
assert len(all_docs) > 0
|
||||
|
||||
# Verify the specific test document was found and has correct properties
|
||||
assert target_doc is not None
|
||||
assert target_doc.semantic_identifier == poll_source_data.get("semantic_identifier")
|
||||
assert target_doc.source == DocumentSource.HIGHSPOT
|
||||
assert target_doc.metadata is not None
|
||||
|
||||
# Verify sections
|
||||
assert len(target_doc.sections) == 1
|
||||
section = target_doc.sections[0]
|
||||
assert section.link == poll_source_data.get("link")
|
||||
assert section.text is not None
|
||||
assert len(section.text) > 0
|
||||
|
||||
|
||||
def test_highspot_connector_validate_credentials(
|
||||
highspot_connector: HighspotConnector,
|
||||
) -> None:
|
||||
|
||||
@@ -1,10 +1,5 @@
{
"target_doc_id": "67cd8eb35d3ee0487de2e704",
"semantic_identifier": "Highspot in Action _ Salesforce Integration",
"link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704",
"poll_source": {
"target_doc_id":"67ef9edcc3f40b2bf3d816a8",
"semantic_identifier":"A Brief Introduction To AI",
"link":"https://www.highspot.com/items/67ef9edcc3f40b2bf3d816a8"
}
"link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704"
}

@@ -35,22 +35,23 @@ def salesforce_connector() -> SalesforceConnector:
|
||||
connector = SalesforceConnector(
|
||||
requested_objects=["Account", "Contact", "Opportunity"],
|
||||
)
|
||||
|
||||
username = os.environ["SF_USERNAME"]
|
||||
password = os.environ["SF_PASSWORD"]
|
||||
security_token = os.environ["SF_SECURITY_TOKEN"]
|
||||
|
||||
connector.load_credentials(
|
||||
{
|
||||
"sf_username": username,
|
||||
"sf_password": password,
|
||||
"sf_security_token": security_token,
|
||||
"sf_username": os.environ["SF_USERNAME"],
|
||||
"sf_password": os.environ["SF_PASSWORD"],
|
||||
"sf_security_token": os.environ["SF_SECURITY_TOKEN"],
|
||||
}
|
||||
)
|
||||
return connector
|
||||
|
||||
|
||||
# TODO: make the credentials not expire
|
||||
@pytest.mark.xfail(
|
||||
reason=(
|
||||
"Credentials change over time, so this test will fail if run when "
|
||||
"the credentials expire."
|
||||
)
|
||||
)
|
||||
def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -> None:
|
||||
test_data = load_test_data()
|
||||
target_test_doc: Document | None = None
|
||||
@@ -60,26 +61,21 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
|
||||
all_docs.append(doc)
|
||||
if doc.id == test_data["id"]:
|
||||
target_test_doc = doc
|
||||
break
|
||||
|
||||
# The number of docs here seems to change actively so do a very loose check
|
||||
# as of 2025-03-28 it was around 32472
|
||||
assert len(all_docs) > 32000
|
||||
assert len(all_docs) < 40000
|
||||
|
||||
assert len(all_docs) == 6
|
||||
assert target_test_doc is not None
|
||||
|
||||
# Set of received links
|
||||
received_links: set[str] = set()
|
||||
# List of received text fields, which contain key-value pairs separated by newlines
|
||||
received_text: list[str] = []
|
||||
recieved_text: list[str] = []
|
||||
|
||||
# Iterate over the sections of the target test doc to extract the links and text
|
||||
for section in target_test_doc.sections:
|
||||
assert section.link
|
||||
assert section.text
|
||||
received_links.add(section.link)
|
||||
received_text.append(section.text)
|
||||
recieved_text.append(section.text)
|
||||
|
||||
# Check that the received links match the expected links from the test data json
|
||||
expected_links = set(test_data["expected_links"])
|
||||
@@ -89,9 +85,8 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
|
||||
expected_text = test_data["expected_text"]
|
||||
if not isinstance(expected_text, list):
|
||||
raise ValueError("Expected text is not a list")
|
||||
|
||||
unparsed_expected_key_value_pairs: list[str] = expected_text
|
||||
received_key_value_pairs = extract_key_value_pairs_to_set(received_text)
|
||||
received_key_value_pairs = extract_key_value_pairs_to_set(recieved_text)
|
||||
expected_key_value_pairs = extract_key_value_pairs_to_set(
|
||||
unparsed_expected_key_value_pairs
|
||||
)
|
||||
@@ -101,21 +96,13 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
|
||||
assert target_test_doc.source == DocumentSource.SALESFORCE
|
||||
assert target_test_doc.semantic_identifier == test_data["semantic_identifier"]
|
||||
assert target_test_doc.metadata == test_data["metadata"]
|
||||
|
||||
assert target_test_doc.primary_owners is not None
|
||||
primary_owner = target_test_doc.primary_owners[0]
|
||||
expected_primary_owner = test_data["primary_owners"]
|
||||
assert isinstance(expected_primary_owner, dict)
|
||||
assert primary_owner.email == expected_primary_owner["email"]
|
||||
assert primary_owner.first_name == expected_primary_owner["first_name"]
|
||||
assert primary_owner.last_name == expected_primary_owner["last_name"]
|
||||
|
||||
assert target_test_doc.primary_owners == test_data["primary_owners"]
|
||||
assert target_test_doc.secondary_owners == test_data["secondary_owners"]
|
||||
assert target_test_doc.title == test_data["title"]
|
||||
|
||||
|
||||
# TODO: make the credentials not expire
|
||||
@pytest.mark.skip(
|
||||
@pytest.mark.xfail(
|
||||
reason=(
|
||||
"Credentials change over time, so this test will fail if run when "
|
||||
"the credentials expire."
|
||||
|
||||
@@ -1,162 +1,20 @@
|
||||
{
|
||||
"id": "SALESFORCE_001bm00000eu6n5AAA",
|
||||
"id": "SALESFORCE_001fI000005drUcQAI",
|
||||
"expected_links": [
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpEeAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqd3AAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoKiAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvDSAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrmHAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrl2AAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvejAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStlvAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpPfAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrP9AAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvlMAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESt3JAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoBkAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStw2AAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrkMAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESojKAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuLEAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoSIAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESu2YAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvgSAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESurnAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrnqAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoB5AAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuJuAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrfyAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/001bm00000eu6n5AAA",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpUHAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsgGAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESr7UAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESu1BAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpqzAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESplZAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvJ3AAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESurKAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStSiAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuJFAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESu8xAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqfzAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqsrAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStoZAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsIUAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsAGAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESv8GAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrOKAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoUmAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESudKAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuJ8AAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvf2AAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESw3qAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESugRAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESr18AAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqV1AAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuLVAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpjoAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqULAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuCAAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrfpAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESp5YAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrMNAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStaUAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESt5LAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrtcAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESomaAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrtIAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoToAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuWLAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrWvAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsJEAA1",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsxwAAD",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvUgAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvWjAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStBuAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpZiAAL",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuhYAAT",
|
||||
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuWAAA1"
|
||||
"https://customization-ruby-2195.my.salesforce.com/001fI000005drUcQAI",
|
||||
"https://customization-ruby-2195.my.salesforce.com/003fI000001jiCPQAY",
|
||||
"https://customization-ruby-2195.my.salesforce.com/017fI00000T7hvsQAB",
|
||||
"https://customization-ruby-2195.my.salesforce.com/006fI000000rDvBQAU"
|
||||
],
|
||||
"expected_text": [
|
||||
"IsDeleted: false\nBillingCity: Shaykh al \u00e1\u00b8\u00a8ad\u00c4\u00abd\nName: Voonder\nCleanStatus: Pending\nBillingStreet: 12 Cambridge Parkway",
|
||||
"Email: eslayqzs@icio.us\nIsDeleted: false\nLastName: Slay\nIsEmailBounced: false\nFirstName: Ebeneser\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ptweedgdh@umich.edu\nIsDeleted: false\nLastName: Tweed\nIsEmailBounced: false\nFirstName: Paulita\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ehurnellnlx@facebook.com\nIsDeleted: false\nLastName: Hurnell\nIsEmailBounced: false\nFirstName: Eliot\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ccarik4q4@google.it\nIsDeleted: false\nLastName: Carik\nIsEmailBounced: false\nFirstName: Chadwick\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: cvannozziina6@moonfruit.com\nIsDeleted: false\nLastName: Vannozzii\nIsEmailBounced: false\nFirstName: Christophorus\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: mikringill2kz@hugedomains.com\nIsDeleted: false\nLastName: Ikringill\nIsEmailBounced: false\nFirstName: Meghann\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: bgrinvalray@fda.gov\nIsDeleted: false\nLastName: Grinval\nIsEmailBounced: false\nFirstName: Berti\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: aollanderhr7@cam.ac.uk\nIsDeleted: false\nLastName: Ollander\nIsEmailBounced: false\nFirstName: Annemarie\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: rwhitesideq38@gravatar.com\nIsDeleted: false\nLastName: Whiteside\nIsEmailBounced: false\nFirstName: Rolando\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: vkrafthmz@techcrunch.com\nIsDeleted: false\nLastName: Kraft\nIsEmailBounced: false\nFirstName: Vidovik\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: jhillaut@4shared.com\nIsDeleted: false\nLastName: Hill\nIsEmailBounced: false\nFirstName: Janel\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: lralstonycs@discovery.com\nIsDeleted: false\nLastName: Ralston\nIsEmailBounced: false\nFirstName: Lorrayne\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: blyttlewba@networkadvertising.org\nIsDeleted: false\nLastName: Lyttle\nIsEmailBounced: false\nFirstName: Ban\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: pplummernvf@technorati.com\nIsDeleted: false\nLastName: Plummer\nIsEmailBounced: false\nFirstName: Pete\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: babrahamoffxpb@theatlantic.com\nIsDeleted: false\nLastName: Abrahamoff\nIsEmailBounced: false\nFirstName: Brander\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ahargieym0@homestead.com\nIsDeleted: false\nLastName: Hargie\nIsEmailBounced: false\nFirstName: Aili\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: hstotthp2@yelp.com\nIsDeleted: false\nLastName: Stott\nIsEmailBounced: false\nFirstName: Hartley\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: jganniclifftuvj@blinklist.com\nIsDeleted: false\nLastName: Ganniclifft\nIsEmailBounced: false\nFirstName: Jamima\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ldodelly8q@ed.gov\nIsDeleted: false\nLastName: Dodell\nIsEmailBounced: false\nFirstName: Lynde\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: rmilner3cp@smh.com.au\nIsDeleted: false\nLastName: Milner\nIsEmailBounced: false\nFirstName: Ralph\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: gghiriardellic19@state.tx.us\nIsDeleted: false\nLastName: Ghiriardelli\nIsEmailBounced: false\nFirstName: Garv\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: rhubatschfpu@nature.com\nIsDeleted: false\nLastName: Hubatsch\nIsEmailBounced: false\nFirstName: Rose\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: mtrenholme1ws@quantcast.com\nIsDeleted: false\nLastName: Trenholme\nIsEmailBounced: false\nFirstName: Mariejeanne\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: jmussettpbd@over-blog.com\nIsDeleted: false\nLastName: Mussett\nIsEmailBounced: false\nFirstName: Juliann\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: bgoroni145@illinois.edu\nIsDeleted: false\nLastName: Goroni\nIsEmailBounced: false\nFirstName: Bernarr\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: afalls3ph@theguardian.com\nIsDeleted: false\nLastName: Falls\nIsEmailBounced: false\nFirstName: Angelia\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: lswettjoi@go.com\nIsDeleted: false\nLastName: Swett\nIsEmailBounced: false\nFirstName: Levon\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: emullinsz38@dailymotion.com\nIsDeleted: false\nLastName: Mullins\nIsEmailBounced: false\nFirstName: Elsa\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ibernettehco@ebay.co.uk\nIsDeleted: false\nLastName: Bernette\nIsEmailBounced: false\nFirstName: Ingrid\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: trisleybtt@simplemachines.org\nIsDeleted: false\nLastName: Risley\nIsEmailBounced: false\nFirstName: Toma\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: rgypsonqx1@goodreads.com\nIsDeleted: false\nLastName: Gypson\nIsEmailBounced: false\nFirstName: Reed\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: cposvneri28@jiathis.com\nIsDeleted: false\nLastName: Posvner\nIsEmailBounced: false\nFirstName: Culley\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: awilmut2rz@geocities.jp\nIsDeleted: false\nLastName: Wilmut\nIsEmailBounced: false\nFirstName: Andy\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: aluckwellra5@exblog.jp\nIsDeleted: false\nLastName: Luckwell\nIsEmailBounced: false\nFirstName: Andreana\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: irollings26j@timesonline.co.uk\nIsDeleted: false\nLastName: Rollings\nIsEmailBounced: false\nFirstName: Ibrahim\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: gspireqpd@g.co\nIsDeleted: false\nLastName: Spire\nIsEmailBounced: false\nFirstName: Gaelan\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: sbezleyk2y@acquirethisname.com\nIsDeleted: false\nLastName: Bezley\nIsEmailBounced: false\nFirstName: Sindee\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: icollerrr@flickr.com\nIsDeleted: false\nLastName: Coller\nIsEmailBounced: false\nFirstName: Inesita\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: kfolliott1bo@nature.com\nIsDeleted: false\nLastName: Folliott\nIsEmailBounced: false\nFirstName: Kennan\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: kroofjfo@gnu.org\nIsDeleted: false\nLastName: Roof\nIsEmailBounced: false\nFirstName: Karlik\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: lcovotti8s4@rediff.com\nIsDeleted: false\nLastName: Covotti\nIsEmailBounced: false\nFirstName: Lucho\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: gpatriskson1rs@census.gov\nIsDeleted: false\nLastName: Patriskson\nIsEmailBounced: false\nFirstName: Gardener\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: spidgleyqvw@usgs.gov\nIsDeleted: false\nLastName: Pidgley\nIsEmailBounced: false\nFirstName: Simona\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: cbecarrak0i@over-blog.com\nIsDeleted: false\nLastName: Becarra\nIsEmailBounced: false\nFirstName: Cally\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: aparkman9td@bbc.co.uk\nIsDeleted: false\nLastName: Parkman\nIsEmailBounced: false\nFirstName: Agneta\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: bboddingtonhn@quantcast.com\nIsDeleted: false\nLastName: Boddington\nIsEmailBounced: false\nFirstName: Betta\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: dcasementx0p@cafepress.com\nIsDeleted: false\nLastName: Casement\nIsEmailBounced: false\nFirstName: Dannie\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: hzornbhe@latimes.com\nIsDeleted: false\nLastName: Zorn\nIsEmailBounced: false\nFirstName: Haleigh\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: cfifieldbjb@blogspot.com\nIsDeleted: false\nLastName: Fifield\nIsEmailBounced: false\nFirstName: Christalle\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ddewerson4t3@skype.com\nIsDeleted: false\nLastName: Dewerson\nIsEmailBounced: false\nFirstName: Dyann\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: khullock52p@sohu.com\nIsDeleted: false\nLastName: Hullock\nIsEmailBounced: false\nFirstName: Kellina\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: tfremantle32n@bandcamp.com\nIsDeleted: false\nLastName: Fremantle\nIsEmailBounced: false\nFirstName: Turner\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: sbernardtylp@nps.gov\nIsDeleted: false\nLastName: Bernardt\nIsEmailBounced: false\nFirstName: Selina\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: smcgettigan8kk@slideshare.net\nIsDeleted: false\nLastName: McGettigan\nIsEmailBounced: false\nFirstName: Sada\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: wdelafontvgn@businesswire.com\nIsDeleted: false\nLastName: Delafont\nIsEmailBounced: false\nFirstName: West\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: lbelsher9ne@indiatimes.com\nIsDeleted: false\nLastName: Belsher\nIsEmailBounced: false\nFirstName: Lou\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: cgoody27y@blogtalkradio.com\nIsDeleted: false\nLastName: Goody\nIsEmailBounced: false\nFirstName: Colene\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: cstodejzz@ucoz.ru\nIsDeleted: false\nLastName: Stode\nIsEmailBounced: false\nFirstName: Curcio\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: abromidgejb@china.com.cn\nIsDeleted: false\nLastName: Bromidge\nIsEmailBounced: false\nFirstName: Ariela\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ldelgardilloqvp@xrea.com\nIsDeleted: false\nLastName: Delgardillo\nIsEmailBounced: false\nFirstName: Lauralee\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: dcroal9t4@businessinsider.com\nIsDeleted: false\nLastName: Croal\nIsEmailBounced: false\nFirstName: Devlin\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: dclarageqzb@wordpress.com\nIsDeleted: false\nLastName: Clarage\nIsEmailBounced: false\nFirstName: Dre\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: dthirlwall3jf@taobao.com\nIsDeleted: false\nLastName: Thirlwall\nIsEmailBounced: false\nFirstName: Dareen\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: tkeddie2lj@wiley.com\nIsDeleted: false\nLastName: Keddie\nIsEmailBounced: false\nFirstName: Tandi\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: jrimingtoni3i@istockphoto.com\nIsDeleted: false\nLastName: Rimington\nIsEmailBounced: false\nFirstName: Judy\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: gtroynet@slashdot.org\nIsDeleted: false\nLastName: Troy\nIsEmailBounced: false\nFirstName: Gail\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: ebunneyh0n@meetup.com\nIsDeleted: false\nLastName: Bunney\nIsEmailBounced: false\nFirstName: Efren\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: yhaken8p3@slate.com\nIsDeleted: false\nLastName: Haken\nIsEmailBounced: false\nFirstName: Yard\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: nolliffeq6q@biblegateway.com\nIsDeleted: false\nLastName: Olliffe\nIsEmailBounced: false\nFirstName: Nani\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: bgalia9jz@odnoklassniki.ru\nIsDeleted: false\nLastName: Galia\nIsEmailBounced: false\nFirstName: Berrie\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: djedrzej3v1@google.com\nIsDeleted: false\nLastName: Jedrzej\nIsEmailBounced: false\nFirstName: Deanne\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: mcamiesh1t@fc2.com\nIsDeleted: false\nLastName: Camies\nIsEmailBounced: false\nFirstName: Mikaela\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: csunshineqni@state.tx.us\nIsDeleted: false\nLastName: Sunshine\nIsEmailBounced: false\nFirstName: Curtis\nIsPriorityRecord: false\nCleanStatus: Pending",
|
||||
"Email: fiannellib46@marriott.com\nIsDeleted: false\nLastName: Iannelli\nIsEmailBounced: false\nFirstName: Felicio\nIsPriorityRecord: false\nCleanStatus: Pending"
|
||||
"BillingPostalCode: 60601\nType: Prospect\nWebsite: www.globalistindustries.com\nBillingCity: Chicago\nDescription: Globalist company\nIsDeleted: false\nIsPartner: false\nPhone: (312) 555-0456\nShippingCountry: USA\nShippingState: IL\nIsBuyer: false\nBillingCountry: USA\nBillingState: IL\nShippingPostalCode: 60601\nBillingStreet: 456 Market St\nIsCustomerPortal: false\nPersonActiveTrackerCount: 0\nShippingCity: Chicago\nShippingStreet: 456 Market St",
|
||||
"FirstName: Michael\nMailingCountry: USA\nActiveTrackerCount: 0\nEmail: m.brown@globalindustries.com\nMailingState: IL\nMailingStreet: 456 Market St\nMailingCity: Chicago\nLastName: Brown\nTitle: CTO\nIsDeleted: false\nPhone: (312) 555-0456\nHasOptedOutOfEmail: false\nIsEmailBounced: false\nMailingPostalCode: 60601",
|
||||
"ForecastCategory: Closed\nName: Global Industries Equipment Sale\nIsDeleted: false\nForecastCategoryName: Closed\nFiscalYear: 2024\nFiscalQuarter: 4\nIsClosed: true\nIsWon: true\nAmount: 5000000.0\nProbability: 100.0\nPushCount: 0\nHasOverdueTask: false\nStageName: Closed Won\nHasOpenActivity: false\nHasOpportunityLineItem: false",
|
||||
"Field: created\nDataType: Text\nIsDeleted: false"
|
||||
],
|
||||
"semantic_identifier": "Voonder",
|
||||
"semantic_identifier": "Unknown Object",
|
||||
"metadata": {},
|
||||
"primary_owners": {"email": "hagen@danswer.ai", "first_name": "Hagen", "last_name": "oneill"},
|
||||
"primary_owners": null,
|
||||
"secondary_owners": null,
|
||||
"title": null
|
||||
}
|
||||
|
||||
@@ -444,7 +444,6 @@ class CCPairManager:
)
if group_sync_result.status_code != 409:
group_sync_result.raise_for_status()
time.sleep(2)

@staticmethod
def get_doc_sync_task(

@@ -14,8 +14,9 @@ from tests.integration.connector_job_tests.slack.slack_api_utils import SlackMan
@pytest.fixture()
def slack_test_setup() -> Generator[tuple[dict[str, Any], dict[str, Any]], None, None]:
slack_client = SlackManager.get_slack_client(os.environ["SLACK_BOT_TOKEN"])
user_map = SlackManager.build_slack_user_email_id_map(slack_client)
admin_user_id = user_map["admin@onyx-test.com"]
admin_user_id = SlackManager.build_slack_user_email_id_map(slack_client)[
"admin@onyx-test.com"
]

(
public_channel,

@@ -3,6 +3,8 @@ from datetime import datetime
from datetime import timezone
from typing import Any

import pytest

from onyx.connectors.models import InputType
from onyx.db.enums import AccessType
from onyx.server.documents.models import DocumentSource
@@ -23,6 +25,7 @@ from tests.integration.common_utils.vespa import vespa_fixture
from tests.integration.connector_job_tests.slack.slack_api_utils import SlackManager


@pytest.mark.xfail(reason="flaky - see DAN-789 for example", strict=False)
def test_slack_permission_sync(
reset: None,
vespa_client: vespa_fixture,
@@ -218,6 +221,7 @@ def test_slack_permission_sync(
assert private_message not in onyx_doc_message_strings


@pytest.mark.xfail(reason="flaky", strict=False)
def test_slack_group_permission_sync(
reset: None,
vespa_client: vespa_fixture,

@@ -5,11 +5,8 @@ from ee.onyx.external_permissions.salesforce.postprocessing import (
)
from onyx.configs.app_configs import BLURB_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.salesforce.utils import BASE_DATA_PATH
from onyx.context.search.models import InferenceChunk

SQLITE_DIR = BASE_DATA_PATH


def create_test_chunk(
doc_id: str,
@@ -42,7 +39,6 @@ def create_test_chunk(

def test_validate_salesforce_access_single_object() -> None:
"""Test filtering when chunk has a single Salesforce object reference"""

section = "This is a test document about a Salesforce object."
test_content = section
test_chunk = create_test_chunk(

@@ -113,18 +113,15 @@ _VALID_SALESFORCE_IDS = [
|
||||
]
|
||||
|
||||
|
||||
def _clear_sf_db(directory: str) -> None:
|
||||
def _clear_sf_db() -> None:
|
||||
"""
|
||||
Clears the SF DB by deleting all files in the data directory.
|
||||
"""
|
||||
shutil.rmtree(directory, ignore_errors=True)
|
||||
shutil.rmtree(BASE_DATA_PATH, ignore_errors=True)
|
||||
|
||||
|
||||
def _create_csv_file(
|
||||
directory: str,
|
||||
object_type: str,
|
||||
records: list[dict],
|
||||
filename: str = "test_data.csv",
|
||||
object_type: str, records: list[dict], filename: str = "test_data.csv"
|
||||
) -> None:
|
||||
"""
|
||||
Creates a CSV file for the given object type and records.
|
||||
@@ -152,10 +149,10 @@ def _create_csv_file(
|
||||
writer.writerow(record)
|
||||
|
||||
# Update the database with the CSV
|
||||
update_sf_db_with_csv(directory, object_type, csv_path)
|
||||
update_sf_db_with_csv(object_type, csv_path)
|
||||
|
||||
|
||||
def _create_csv_with_example_data(directory: str) -> None:
|
||||
def _create_csv_with_example_data() -> None:
|
||||
"""
|
||||
Creates CSV files with example data, organized by object type.
|
||||
"""
|
||||
@@ -345,10 +342,10 @@ def _create_csv_with_example_data(directory: str) -> None:
|
||||
|
||||
# Create CSV files for each object type
|
||||
for object_type, records in example_data.items():
|
||||
_create_csv_file(directory, object_type, records)
|
||||
_create_csv_file(object_type, records)
|
||||
|
||||
|
||||
def _test_query(directory: str) -> None:
|
||||
def _test_query() -> None:
|
||||
"""
|
||||
Tests querying functionality by verifying:
|
||||
1. All expected Account IDs are found
|
||||
@@ -404,7 +401,7 @@ def _test_query(directory: str) -> None:
|
||||
}
|
||||
|
||||
# Get all Account IDs
|
||||
account_ids = find_ids_by_type(directory, "Account")
|
||||
account_ids = find_ids_by_type("Account")
|
||||
|
||||
# Verify we found all expected accounts
|
||||
assert len(account_ids) == len(
|
||||
@@ -416,7 +413,7 @@ def _test_query(directory: str) -> None:
|
||||
|
||||
# Verify each account's data
|
||||
for acc_id in account_ids:
|
||||
combined = get_record(directory, acc_id)
|
||||
combined = get_record(acc_id)
|
||||
assert combined is not None, f"Could not find account {acc_id}"
|
||||
|
||||
expected = expected_accounts[acc_id]
|
||||
@@ -431,7 +428,7 @@ def _test_query(directory: str) -> None:
|
||||
print("All query tests passed successfully!")
|
||||
|
||||
|
||||
def _test_upsert(directory: str) -> None:
|
||||
def _test_upsert() -> None:
|
||||
"""
|
||||
Tests upsert functionality by:
|
||||
1. Updating an existing account
|
||||
@@ -456,10 +453,10 @@ def _test_upsert(directory: str) -> None:
|
||||
},
|
||||
]
|
||||
|
||||
_create_csv_file(directory, "Account", update_data, "update_data.csv")
|
||||
_create_csv_file("Account", update_data, "update_data.csv")
|
||||
|
||||
# Verify the update worked
|
||||
updated_record = get_record(directory, _VALID_SALESFORCE_IDS[0])
|
||||
updated_record = get_record(_VALID_SALESFORCE_IDS[0])
|
||||
assert updated_record is not None, "Updated record not found"
|
||||
assert updated_record.data["Name"] == "Acme Inc. Updated", "Name not updated"
|
||||
assert (
|
||||
@@ -467,7 +464,7 @@ def _test_upsert(directory: str) -> None:
|
||||
), "Description not added"
|
||||
|
||||
# Verify the new record was created
|
||||
new_record = get_record(directory, _VALID_SALESFORCE_IDS[2])
|
||||
new_record = get_record(_VALID_SALESFORCE_IDS[2])
|
||||
assert new_record is not None, "New record not found"
|
||||
assert new_record.data["Name"] == "New Company Inc.", "New record name incorrect"
|
||||
assert new_record.data["AnnualRevenue"] == "1000000", "New record revenue incorrect"
|
||||
@@ -475,7 +472,7 @@ def _test_upsert(directory: str) -> None:
|
||||
print("All upsert tests passed successfully!")
|
||||
|
||||
|
||||
def _test_relationships(directory: str) -> None:
|
||||
def _test_relationships() -> None:
|
||||
"""
|
||||
Tests relationship shelf updates and queries by:
|
||||
1. Creating test data with relationships
|
||||
@@ -516,11 +513,11 @@ def _test_relationships(directory: str) -> None:
|
||||
|
||||
# Create and update CSV files for each object type
|
||||
for object_type, records in test_data.items():
|
||||
_create_csv_file(directory, object_type, records, "relationship_test.csv")
|
||||
_create_csv_file(object_type, records, "relationship_test.csv")
|
||||
|
||||
# Test relationship queries
|
||||
# All these objects should be children of Acme Inc.
|
||||
child_ids = get_child_ids(directory, _VALID_SALESFORCE_IDS[0])
|
||||
child_ids = get_child_ids(_VALID_SALESFORCE_IDS[0])
|
||||
assert len(child_ids) == 4, f"Expected 4 child objects, found {len(child_ids)}"
|
||||
assert _VALID_SALESFORCE_IDS[13] in child_ids, "Case 1 not found in relationship"
|
||||
assert _VALID_SALESFORCE_IDS[14] in child_ids, "Case 2 not found in relationship"
|
||||
@@ -530,7 +527,7 @@ def _test_relationships(directory: str) -> None:
|
||||
), "Opportunity not found in relationship"
|
||||
|
||||
# Test querying relationships for a different account (should be empty)
|
||||
other_account_children = get_child_ids(directory, _VALID_SALESFORCE_IDS[1])
|
||||
other_account_children = get_child_ids(_VALID_SALESFORCE_IDS[1])
|
||||
assert (
|
||||
len(other_account_children) == 0
|
||||
), "Expected no children for different account"
|
||||
@@ -538,7 +535,7 @@ def _test_relationships(directory: str) -> None:
|
||||
print("All relationship tests passed successfully!")
|
||||
|
||||
|
||||
def _test_account_with_children(directory: str) -> None:
|
||||
def _test_account_with_children() -> None:
|
||||
"""
|
||||
Tests querying all accounts and retrieving their child objects.
|
||||
This test verifies that:
|
||||
@@ -547,16 +544,16 @@ def _test_account_with_children(directory: str) -> None:
|
||||
3. Child object data is complete and accurate
|
||||
"""
|
||||
# First get all account IDs
|
||||
account_ids = find_ids_by_type(directory, "Account")
|
||||
account_ids = find_ids_by_type("Account")
|
||||
assert len(account_ids) > 0, "No accounts found"
|
||||
|
||||
# For each account, get its children and verify the data
|
||||
for account_id in account_ids:
|
||||
account = get_record(directory, account_id)
|
||||
account = get_record(account_id)
|
||||
assert account is not None, f"Could not find account {account_id}"
|
||||
|
||||
# Get all child objects
|
||||
child_ids = get_child_ids(directory, account_id)
|
||||
child_ids = get_child_ids(account_id)
|
||||
|
||||
# For Acme Inc., verify specific relationships
|
||||
if account_id == _VALID_SALESFORCE_IDS[0]: # Acme Inc.
|
||||
@@ -567,7 +564,7 @@ def _test_account_with_children(directory: str) -> None:
|
||||
# Get all child records
|
||||
child_records = []
|
||||
for child_id in child_ids:
|
||||
child_record = get_record(directory, child_id)
|
||||
child_record = get_record(child_id)
|
||||
if child_record is not None:
|
||||
child_records.append(child_record)
|
||||
# Verify Cases
|
||||
@@ -602,7 +599,7 @@ def _test_account_with_children(directory: str) -> None:
|
||||
print("All account with children tests passed successfully!")
|
||||
|
||||
|
||||
def _test_relationship_updates(directory: str) -> None:
|
||||
def _test_relationship_updates() -> None:
    """
    Tests that relationships are properly updated when a child object's parent reference changes.
    This test verifies:
@@ -619,10 +616,10 @@ def _test_relationship_updates(directory: str) -> None:
            "LastName": "Contact",
        }
    ]
    _create_csv_file(directory, "Contact", initial_contact, "initial_contact.csv")
    _create_csv_file("Contact", initial_contact, "initial_contact.csv")

    # Verify initial relationship
    acme_children = get_child_ids(directory, _VALID_SALESFORCE_IDS[0])
    acme_children = get_child_ids(_VALID_SALESFORCE_IDS[0])
    assert (
        _VALID_SALESFORCE_IDS[40] in acme_children
    ), "Initial relationship not created"
@@ -636,22 +633,22 @@ def _test_relationship_updates(directory: str) -> None:
            "LastName": "Contact",
        }
    ]
    _create_csv_file(directory, "Contact", updated_contact, "updated_contact.csv")
    _create_csv_file("Contact", updated_contact, "updated_contact.csv")

    # Verify old relationship is removed
    acme_children = get_child_ids(directory, _VALID_SALESFORCE_IDS[0])
    acme_children = get_child_ids(_VALID_SALESFORCE_IDS[0])
    assert (
        _VALID_SALESFORCE_IDS[40] not in acme_children
    ), "Old relationship not removed"

    # Verify new relationship is created
    globex_children = get_child_ids(directory, _VALID_SALESFORCE_IDS[1])
    globex_children = get_child_ids(_VALID_SALESFORCE_IDS[1])
    assert _VALID_SALESFORCE_IDS[40] in globex_children, "New relationship not created"

    print("All relationship update tests passed successfully!")


def _test_get_affected_parent_ids(directory: str) -> None:
def _test_get_affected_parent_ids() -> None:
    """
    Tests get_affected_parent_ids functionality by verifying:
    1. IDs that are directly in the parent_types list are included
@@ -686,13 +683,13 @@ def _test_get_affected_parent_ids(directory: str) -> None:

    # Create and update CSV files for test data
    for object_type, records in test_data.items():
        _create_csv_file(directory, object_type, records)
        _create_csv_file(object_type, records)

    # Test Case 1: Account directly in updated_ids and parent_types
    updated_ids = [_VALID_SALESFORCE_IDS[1]]  # Parent Account 2
    parent_types = ["Account"]
    affected_ids_by_type = dict(
        get_affected_parent_ids_by_type(directory, updated_ids, parent_types)
        get_affected_parent_ids_by_type(updated_ids, parent_types)
    )
    assert "Account" in affected_ids_by_type, "Account type not in affected_ids_by_type"
    assert (
@@ -703,7 +700,7 @@ def _test_get_affected_parent_ids(directory: str) -> None:
    updated_ids = [_VALID_SALESFORCE_IDS[40]]  # Child Contact
    parent_types = ["Account"]
    affected_ids_by_type = dict(
        get_affected_parent_ids_by_type(directory, updated_ids, parent_types)
        get_affected_parent_ids_by_type(updated_ids, parent_types)
    )
    assert "Account" in affected_ids_by_type, "Account type not in affected_ids_by_type"
    assert (
@@ -714,7 +711,7 @@ def _test_get_affected_parent_ids(directory: str) -> None:
    updated_ids = [_VALID_SALESFORCE_IDS[1], _VALID_SALESFORCE_IDS[40]]  # Both cases
    parent_types = ["Account"]
    affected_ids_by_type = dict(
        get_affected_parent_ids_by_type(directory, updated_ids, parent_types)
        get_affected_parent_ids_by_type(updated_ids, parent_types)
    )
    assert "Account" in affected_ids_by_type, "Account type not in affected_ids_by_type"
    affected_ids = affected_ids_by_type["Account"]
@@ -729,7 +726,7 @@ def _test_get_affected_parent_ids(directory: str) -> None:
    updated_ids = [_VALID_SALESFORCE_IDS[40]]  # Child Contact
    parent_types = ["Opportunity"]  # Wrong type
    affected_ids_by_type = dict(
        get_affected_parent_ids_by_type(directory, updated_ids, parent_types)
        get_affected_parent_ids_by_type(updated_ids, parent_types)
    )
    assert len(affected_ids_by_type) == 0, "Should return empty dict when no matches"

@@ -737,15 +734,13 @@ def _test_get_affected_parent_ids(directory: str) -> None:


def test_salesforce_sqlite() -> None:
    directory = BASE_DATA_PATH

    _clear_sf_db(directory)
    init_db(directory)
    _create_csv_with_example_data(directory)
    _test_query(directory)
    _test_upsert(directory)
    _test_relationships(directory)
    _test_account_with_children(directory)
    _test_relationship_updates(directory)
    _test_get_affected_parent_ids(directory)
    _clear_sf_db(directory)
    _clear_sf_db()
    init_db()
    _create_csv_with_example_data()
    _test_query()
    _test_upsert()
    _test_relationships()
    _test_account_with_children()
    _test_relationship_updates()
    _test_get_affected_parent_ids()
    _clear_sf_db()

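The hunks above drop the explicit `directory` argument from the Salesforce SQLite test helpers and from `test_salesforce_sqlite` itself. A minimal sketch of the pattern this implies is below: the helpers resolve a module-level base path instead of having every caller thread it through. `BASE_DATA_PATH` appears in the old test body; `get_db_path` and the `init_db` body are hypothetical stand-ins, not the repository's actual implementation.

```python
import os
import sqlite3

# BASE_DATA_PATH is referenced in the old test body; its exact value here is assumed.
BASE_DATA_PATH = os.path.join(os.path.dirname(__file__), "data")


def get_db_path(directory: str | None = None) -> str:
    # Hypothetical helper: fall back to the module-level default when no directory
    # is passed, which is what lets callers write init_db() with no arguments.
    return os.path.join(directory or BASE_DATA_PATH, "salesforce_db.sqlite")


def init_db(directory: str | None = None) -> None:
    # Hypothetical sketch of a helper that no longer needs the directory argument.
    os.makedirs(directory or BASE_DATA_PATH, exist_ok=True)
    with sqlite3.connect(get_db_path(directory)) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS objects (id TEXT PRIMARY KEY, parent_id TEXT)"
        )
```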
@@ -1,529 +0,0 @@
|
||||
Company,Link
|
||||
1849-bio,https://x.com/1849bio
|
||||
1stcollab,https://twitter.com/ycombinator
|
||||
abundant,https://x.com/abundant_labs
|
||||
activepieces,https://mobile.twitter.com/mabuaboud
|
||||
acx,https://twitter.com/ycombinator
|
||||
adri-ai,https://twitter.com/darshitac_
|
||||
affil-ai,https://twitter.com/ycombinator
|
||||
agave,https://twitter.com/moyicat
|
||||
aglide,https://twitter.com/pdmcguckian
|
||||
ai-2,https://twitter.com/the_yuppy
|
||||
ai-sell,https://x.com/liuzjerry
|
||||
airtrain-ai,https://twitter.com/neutralino1
|
||||
aisdr,https://twitter.com/YuriyZaremba
|
||||
alex,https://x.com/DanielEdrisian
|
||||
alga-biosciences,https://twitter.com/algabiosciences
|
||||
alguna,https://twitter.com/aleks_djekic
|
||||
alixia,https://twitter.com/ycombinator
|
||||
aminoanalytica,https://x.com/lilwuuzivert
|
||||
anara,https://twitter.com/naveedjanmo
|
||||
andi,https://twitter.com/MiamiAngela
|
||||
andoria,https://x.com/dbudimane
|
||||
andromeda-surgical,https://twitter.com/nickdamian0
|
||||
anglera,https://twitter.com/ycombinator
|
||||
angstrom-ai,https://twitter.com/JaviAC7
|
||||
ankr-health,https://twitter.com/Ankr_us
|
||||
apoxy,https://twitter.com/ycombinator
|
||||
apten,https://twitter.com/dho1357
|
||||
aragorn-ai,https://twitter.com/ycombinator
|
||||
arc-2,https://twitter.com/DarkMirage
|
||||
archilabs,https://twitter.com/ycombinator
|
||||
arcimus,https://twitter.com/husseinsyed73
|
||||
argovox,https://www.argovox.com/
|
||||
artemis-search,https://twitter.com/ycombinator
|
||||
artie,https://x.com/JacquelineSYC19
|
||||
asklio,https://twitter.com/butterflock
|
||||
atlas-2,https://twitter.com/jobryan
|
||||
attain,https://twitter.com/aamir_hudda
|
||||
autocomputer,https://twitter.com/madhavsinghal_
|
||||
automat,https://twitter.com/lucas0choa
|
||||
automorphic,https://twitter.com/sandkoan
|
||||
autopallet-robotics,https://twitter.com/ycombinator
|
||||
autumn-labs,https://twitter.com/ycombinator
|
||||
aviary,https://twitter.com/ycombinator
|
||||
azuki,https://twitter.com/VamptVo
|
||||
banabo,https://twitter.com/ycombinator
|
||||
baseline-ai,https://twitter.com/ycombinator
|
||||
baserun,https://twitter.com/effyyzhang
|
||||
benchify,https://www.x.com/maxvonhippel
|
||||
berry,https://twitter.com/annchanyt
|
||||
bifrost,https://twitter.com/0xMysterious
|
||||
bifrost-orbital,https://x.com/ionkarbatra
|
||||
biggerpicture,https://twitter.com/ycombinator
|
||||
biocartesian,https://twitter.com/ycombinator
|
||||
bland-ai,https://twitter.com/zaygranet
|
||||
blast,https://x.com/useblast
|
||||
blaze,https://twitter.com/larfy_rothwell
|
||||
bluebirds,https://twitter.com/RohanPunamia
|
||||
bluedot,https://twitter.com/selinayfilizp
|
||||
bluehill-payments,https://twitter.com/HimanshuMinocha
|
||||
blyss,https://twitter.com/blyssdev
|
||||
bolto,https://twitter.com/mrinalsingh02?lang=en
|
||||
botcity,https://twitter.com/lorhancaproni
|
||||
boundo,https://twitter.com/ycombinator
|
||||
bramble,https://x.com/meksikanpijha
|
||||
bricksai,https://twitter.com/ycombinator
|
||||
broccoli-ai,https://twitter.com/abhishekjain25
|
||||
bronco-ai,https://twitter.com/dluozhang
|
||||
bunting-labs,https://twitter.com/normconstant
|
||||
byterat,https://twitter.com/penelopekjones_
|
||||
callback,https://twitter.com/ycombinator
|
||||
cambio-2,https://twitter.com/ycombinator
|
||||
camfer,https://x.com/AryaBastani
|
||||
campfire-2,https://twitter.com/ycombinator
|
||||
campfire-applied-ai-company,https://twitter.com/siamakfr
|
||||
candid,https://x.com/kesavkosana
|
||||
canvas,https://x.com/essamsleiman
|
||||
capsule,https://twitter.com/kelsey_pedersen
|
||||
cardinal,http://twitter.com/nadavwiz
|
||||
cardinal-gray,https://twitter.com/ycombinator
|
||||
cargo,https://twitter.com/aureeaubert
|
||||
cartage,https://twitter.com/ycombinator
|
||||
cashmere,https://twitter.com/shashankbuilds
|
||||
cedalio,https://twitter.com/LucianaReznik
|
||||
cekura-2,https://x.com/tarush_agarwal_
|
||||
central,https://twitter.com/nilaymod
|
||||
champ,https://twitter.com/ycombinator
|
||||
cheers,https://twitter.com/ycombinator
|
||||
chequpi,https://twitter.com/sudshekhar02
|
||||
chima,https://twitter.com/nikharanirghin
|
||||
cinapse,https://www.twitter.com/hgphillipsiv
|
||||
ciro,https://twitter.com/davidjwiner
|
||||
clara,https://x.com/levinsonjon
|
||||
cleancard,https://twitter.com/_tom_dot_com
|
||||
clearspace,https://twitter.com/rbfasho
|
||||
cobbery,https://twitter.com/Dan_The_Goodman
|
||||
codeviz,https://x.com/liam_prev
|
||||
coil-inc,https://twitter.com/ycombinator
|
||||
coldreach,https://twitter.com/ycombinator
|
||||
combinehealth,https://twitter.com/ycombinator
|
||||
comfy-deploy,https://twitter.com/nicholaskkao
|
||||
complete,https://twitter.com/ranimavram
|
||||
conductor-quantum,https://twitter.com/BrandonSeverin
|
||||
conduit,https://twitter.com/ycombinator
|
||||
continue,https://twitter.com/tylerjdunn
|
||||
contour,https://twitter.com/ycombinator
|
||||
coperniq,https://twitter.com/abdullahzandani
|
||||
corgea,https://twitter.com/asadeddin
|
||||
corgi,https://twitter.com/nico_laqua?lang=en
|
||||
corgi-labs,https://twitter.com/ycombinator
|
||||
coris,https://twitter.com/psvinodh
|
||||
cosine,https://twitter.com/AlistairPullen
|
||||
courtyard-io,https://twitter.com/lejeunedall
|
||||
coverage-cat,https://twitter.com/coveragecats
|
||||
craftos,https://twitter.com/wa3l
|
||||
craniometrix,https://craniometrix.com
|
||||
ctgt,https://twitter.com/cyrilgorlla
|
||||
curo,https://x.com/EnergizedAndrew
|
||||
dagworks-inc,https://twitter.com/dagworks
|
||||
dart,https://twitter.com/milad3malek
|
||||
dashdive,https://twitter.com/micahawheat
|
||||
dataleap,https://twitter.com/jh_damm
|
||||
decisional-ai,https://x.com/groovetandon
|
||||
decoda-health,https://twitter.com/ycombinator
|
||||
deepsilicon,https://x.com/abhireddy2004
|
||||
delfino-ai,https://twitter.com/ycombinator
|
||||
demo-gorilla,https://twitter.com/ycombinator
|
||||
demospace,https://www.twitter.com/nick_fiacco
|
||||
dench-com,https://www.twitter.com/markrachapoom
|
||||
denormalized,https://twitter.com/IAmMattGreen
|
||||
dev-tools-ai,https://twitter.com/ycombinator
|
||||
diffusion-studio,https://x.com/MatthiasRuiz22
|
||||
digitalcarbon,https://x.com/CtrlGuruDelete
|
||||
dimely,https://x.com/UseDimely
|
||||
disputeninja,https://twitter.com/legitmaxwu
|
||||
diversion,https://twitter.com/sasham1
|
||||
dmodel,https://twitter.com/dmooooon
|
||||
doctor-droid,https://twitter.com/TheBengaluruGuy
|
||||
dodo,https://x.com/dominik_moehrle
|
||||
dojah-inc,https://twitter.com/ololaday
|
||||
domu-technology-inc,https://twitter.com/ycombinator
|
||||
dr-treat,https://twitter.com/rakeshtondon
|
||||
dreamrp,https://x.com/dreamrpofficial
|
||||
drivingforce,https://twitter.com/drivingforcehq
|
||||
dynamo-ai,https://twitter.com/dynamo_fl
|
||||
edgebit,https://twitter.com/robszumski
|
||||
educato-ai,https://x.com/FelixGabler
|
||||
electric-air-2,https://twitter.com/JezOsborne
|
||||
ember,https://twitter.com/hsinleiwang
|
||||
ember-robotics,https://twitter.com/ycombinator
|
||||
emergent,https://twitter.com/mukundjha
|
||||
emobi,https://twitter.com/ycombinator
|
||||
entangl,https://twitter.com/Shapol_m
|
||||
envelope,https://twitter.com/joshuakcockrell
|
||||
et-al,https://twitter.com/ycombinator
|
||||
eugit-therapeutics,http://www.eugittx.com
|
||||
eventual,https://twitter.com/sammy_sidhu
|
||||
evoly,https://twitter.com/ycombinator
|
||||
expand-ai,https://twitter.com/timsuchanek
|
||||
ezdubs,https://twitter.com/PadmanabhanKri
|
||||
fabius,https://twitter.com/adayNU
|
||||
fazeshift,https://twitter.com/ycombinator
|
||||
felafax,https://twitter.com/ThatNithin
|
||||
fetchr,https://twitter.com/CalvinnChenn
|
||||
fiber-ai,https://twitter.com/AdiAgashe
|
||||
ficra,https://x.com/ficra_ai
|
||||
fiddlecube,https://twitter.com/nupoor_neha
|
||||
finic,https://twitter.com/jfan001
|
||||
finta,https://www.twitter.com/andywang
|
||||
fintool,https://twitter.com/nicbstme
|
||||
finvest,https://twitter.com/shivambharuka
|
||||
firecrawl,https://x.com/ericciarla
|
||||
firstwork,https://twitter.com/techie_Shubham
|
||||
fixa,https://x.com/jonathanzliu
|
||||
flair-health,https://twitter.com/adivawhocodes
|
||||
fleek,https://twitter.com/ycombinator
|
||||
fleetworks,https://twitter.com/ycombinator
|
||||
flike,https://twitter.com/yajmch
|
||||
flint-2,https://twitter.com/hungrysohan
|
||||
floworks,https://twitter.com/sarthaks92
|
||||
focus-buddy,https://twitter.com/yash14700/
|
||||
forerunner-ai,https://x.com/willnida0
|
||||
founders,https://twitter.com/ycombinator
|
||||
foundry,https://x.com/FoundryAI_
|
||||
freestyle,https://x.com/benswerd
|
||||
fresco,https://twitter.com/ycombinator
|
||||
friday,https://x.com/AllenNaliath
|
||||
frigade,https://twitter.com/FrigadeHQ
|
||||
futureclinic,https://twitter.com/usamasyedmd
|
||||
gait,https://twitter.com/AlexYHsia
|
||||
galini,https://twitter.com/ycombinator
|
||||
gauge,https://twitter.com/the1024th
|
||||
gecko-security,https://x.com/jjjutla
|
||||
general-analysis,https://twitter.com/ycombinator
|
||||
giga-ml,https://twitter.com/varunvummadi
|
||||
glade,https://twitter.com/ycombinator
|
||||
glass-health,https://twitter.com/dereckwpaul
|
||||
goodfin,https://twitter.com/ycombinator
|
||||
grai,https://twitter.com/ycombinator
|
||||
greenlite,https://twitter.com/will_lawrenceTO
|
||||
grey,https://www.twitter.com/kingidee
|
||||
happyrobot,https://twitter.com/pablorpalafox
|
||||
haystack-software,https://x.com/AkshaySubr42403
|
||||
health-harbor,https://twitter.com/AlanLiu96
|
||||
healthspark,https://twitter.com/stephengrinich
|
||||
hedgehog-2,https://twitter.com/ycombinator
|
||||
helicone,https://twitter.com/justinstorre
|
||||
heroui,https://x.com/jrgarciadev
|
||||
hoai,https://twitter.com/ycombinator
|
||||
hockeystack,https://twitter.com/ycombinator
|
||||
hokali,https://twitter.com/hokalico
|
||||
homeflow,https://twitter.com/ycombinator
|
||||
hubble-network,https://twitter.com/BenWild10
|
||||
humand,https://twitter.com/nicolasbenenzon
|
||||
humanlayer,https://twitter.com/dexhorthy
|
||||
hydra,https://twitter.com/JoeSciarrino
|
||||
hyperbound,https://twitter.com/sguduguntla
|
||||
ideate-xyz,https://twitter.com/nomocodes
|
||||
inbuild,https://twitter.com/TySharp_iB
|
||||
indexical,https://twitter.com/try_nebula
|
||||
industrial-next,https://twitter.com/ycombinator
|
||||
infisical,https://twitter.com/matsiiako
|
||||
inkeep,https://twitter.com/nickgomezc
|
||||
inlet-2,https://twitter.com/inlet_ai
|
||||
innkeeper,https://twitter.com/tejasybhakta
|
||||
instant,https://twitter.com/JoeAverbukh
|
||||
integrated-reasoning,https://twitter.com/d4r5c2
|
||||
interlock,https://twitter.com/ycombinator
|
||||
intryc,https://x.com/alexmarantelos?lang=en
|
||||
invert,https://twitter.com/purrmin
|
||||
iollo,https://twitter.com/daniel_gomari
|
||||
jamble,https://twitter.com/ycombinator
|
||||
joon-health,https://twitter.com/IsaacVanEaves
|
||||
juicebox,https://twitter.com/davepaffenholz
|
||||
julius,https://twitter.com/0interestrates
|
||||
karmen,https://twitter.com/ycombinator
|
||||
kenley,https://x.com/KenleyAI
|
||||
keylika,https://twitter.com/buddhachaudhuri
|
||||
khoj,https://twitter.com/debanjum
|
||||
kite,https://twitter.com/DerekFeehrer
|
||||
kivo-health,https://twitter.com/vaughnkoch
|
||||
knowtex,https://twitter.com/CarolineCZhang
|
||||
koala,https://twitter.com/studioseinstein?s=11
|
||||
kopra-bio,https://x.com/AF_Haddad
|
||||
kura,https://x.com/kura_labs
|
||||
laminar,https://twitter.com/skull8888888888
|
||||
lancedb,https://twitter.com/changhiskhan
|
||||
latent,https://twitter.com/ycombinator
|
||||
layerup,https://twitter.com/arnavbathla20
|
||||
lazyeditor,https://twitter.com/jee_cash
|
||||
ledgerup,https://twitter.com/josephrjohnson
|
||||
lifelike,https://twitter.com/alecxiang1
|
||||
lighthouz-ai,https://x.com/srijankedia
|
||||
lightski,https://www.twitter.com/hansenq
|
||||
ligo-biosciences,https://x.com/ArdaGoreci/status/1830744265007480934
|
||||
line-build,https://twitter.com/ycombinator
|
||||
lingodotdev,https://twitter.com/maxprilutskiy
|
||||
linkgrep,https://twitter.com/linkgrep
|
||||
linum,https://twitter.com/schopra909
|
||||
livedocs,https://twitter.com/arsalanbashir
|
||||
luca,https://twitter.com/LucaPricingHq
|
||||
lumenary,https://twitter.com/vivekhaz
|
||||
lune,https://x.com/samuelp4rk
|
||||
lynx,https://twitter.com/ycombinator
|
||||
magic-loops,https://twitter.com/jumploops
|
||||
manaflow,https://twitter.com/austinywang
|
||||
mandel-ai,https://twitter.com/shmkkr
|
||||
martin,https://twitter.com/martinvoiceai
|
||||
matano,https://twitter.com/AhmedSamrose
|
||||
mdhub,https://twitter.com/ealamolda
|
||||
mederva-health,http://twitter.com/sabihmir
|
||||
medplum,https://twitter.com/ReshmaKhilnani
|
||||
melty,https://x.com/charliebholtz
|
||||
mem0,https://twitter.com/taranjeetio
|
||||
mercator,https://www.twitter.com/ajdstein
|
||||
mercoa,https://twitter.com/Sarora27
|
||||
meru,https://twitter.com/rohanarora_
|
||||
metalware,https://twitter.com/ryanchowww
|
||||
metriport,https://twitter.com/dimagoncharov_
|
||||
mica-ai,https://twitter.com/ycombinator
|
||||
middleware,https://twitter.com/laduramvishnoi
|
||||
midship,https://twitter.com/_kietay
|
||||
mintlify,https://twitter.com/hanwangio
|
||||
minusx,https://twitter.com/nuwandavek
|
||||
miracle,https://twitter.com/ycombinator
|
||||
miru-ml,https://twitter.com/armelwtalla
|
||||
mito-health,https://twitter.com/teemingchew
|
||||
mocha,https://twitter.com/nichochar
|
||||
modern-realty,https://x.com/RIsanians
|
||||
modulari-t,https://twitter.com/ycombinator
|
||||
mogara,https://twitter.com/ycombinator
|
||||
monterey-ai,https://twitter.com/chunonline
|
||||
moonglow,https://twitter.com/leilavclark
|
||||
moonshine,https://x.com/useMoonshine
|
||||
moreta,https://twitter.com/ycombinator
|
||||
mutable-ai,https://x.com/smahsramo
|
||||
myria,https://twitter.com/reyflemings
|
||||
nango,https://twitter.com/rguldener
|
||||
nanograb,https://twitter.com/lauhoyeung
|
||||
nara,https://twitter.com/join_nara
|
||||
narrative,https://twitter.com/axitkhurana
|
||||
nectar,https://twitter.com/AllenWang314
|
||||
neosync,https://twitter.com/evisdrenova
|
||||
nerve,https://x.com/fortress_build
|
||||
networkocean,https://twitter.com/sammendel4
|
||||
ngrow-ai,https://twitter.com/ycombinator
|
||||
no-cap,https://x.com/nocapso
|
||||
nowadays,https://twitter.com/ycombinator
|
||||
numeral,https://www.twitter.com/mduvall_
|
||||
obento-health,https://twitter.com/ycombinator
|
||||
octopipe,https://twitter.com/abhishekray07
|
||||
odo,https://twitter.com/ycombinator
|
||||
ofone,https://twitter.com/ycombinator
|
||||
onetext,http://twitter.com/jfudem
|
||||
openfunnel,https://x.com/fenilsuchak
|
||||
opensight,https://twitter.com/OpenSightAI
|
||||
ora-ai,https://twitter.com/ryan_rl_phelps
|
||||
orchid,https://twitter.com/ycombinator
|
||||
origami-agents,https://x.com/fin465
|
||||
outerbase,https://www.twitter.com/burcs
|
||||
outerport,https://x.com/yongyuanxi
|
||||
outset,https://twitter.com/AaronLCannon
|
||||
overeasy,https://twitter.com/skyflylu
|
||||
overlap,https://x.com/jbaerofficial
|
||||
oway,https://twitter.com/owayinc
|
||||
ozone,https://twitter.com/maxvwolff
|
||||
pair-ai,https://twitter.com/ycombinator
|
||||
palmier,https://twitter.com/ycombinator
|
||||
panora,https://twitter.com/rflih_
|
||||
parabolic,https://twitter.com/ycombinator
|
||||
paragon-ai,https://twitter.com/ycombinator
|
||||
parahelp,https://twitter.com/ankerbachryhl
|
||||
parity,https://x.com/wilson_spearman
|
||||
parley,https://twitter.com/ycombinator
|
||||
patched,https://x.com/rohan_sood15
|
||||
pearson-labs,https://twitter.com/ycombinator
|
||||
pelm,https://twitter.com/ycombinator
|
||||
penguin-ai,https://twitter.com/ycombinator
|
||||
peoplebox,https://twitter.com/abhichugh
|
||||
permitflow,https://twitter.com/ycombinator
|
||||
permitportal,https://twitter.com/rgmazilu
|
||||
persana-ai,https://www.twitter.com/tweetsreez
|
||||
pharos,https://x.com/felix_brann
|
||||
phind,https://twitter.com/michaelroyzen
|
||||
phonely,https://x.com/phonely_ai
|
||||
pier,https://twitter.com/ycombinator
|
||||
pierre,https://twitter.com/fat
|
||||
pinnacle,https://twitter.com/SeanRoades
|
||||
pipeshift,https://x.com/FerraoEnrique
|
||||
pivot,https://twitter.com/raimietang
|
||||
planbase,https://twitter.com/ycombinator
|
||||
plover-parametrics,https://twitter.com/ycombinator
|
||||
plutis,https://twitter.com/kamil_m_ali
|
||||
poka-labs,https://twitter.com/ycombinator
|
||||
poly,https://twitter.com/Denizen_Kane
|
||||
polymath-robotics,https://twitter.com/stefanesa
|
||||
ponyrun,https://twitter.com/ycombinator
|
||||
poplarml,https://twitter.com/dnaliu17
|
||||
posh,https://twitter.com/PoshElectric
|
||||
power-to-the-brand,https://twitter.com/ycombinator
|
||||
primevault,https://twitter.com/prashantupd
|
||||
prohostai,https://twitter.com/bilguunu
|
||||
promptloop,https://twitter.com/PeterbMangan
|
||||
propaya,https://x.com/PropayaOfficial
|
||||
proper,https://twitter.com/kylemaloney_
|
||||
proprise,https://twitter.com/kragerDev
|
||||
protegee,https://x.com/kirthibanothu
|
||||
pump-co,https://www.twitter.com/spndn07/
|
||||
pumpkin,https://twitter.com/SamuelCrombie
|
||||
pure,https://twitter.com/collectpure
|
||||
pylon-2,https://x.com/marty_kausas
|
||||
pyq-ai,https://twitter.com/araghuvanshi2
|
||||
query-vary,https://twitter.com/DJFinetunes
|
||||
rankai,https://x.com/rankai_ai
|
||||
rastro,https://twitter.com/baptiste_cumin
|
||||
reactwise,https://twitter.com/ycombinator
|
||||
read-bean,https://twitter.com/maggieqzhang
|
||||
readily,https://twitter.com/ycombinator
|
||||
redouble-ai,https://twitter.com/pneumaticdill?s=21
|
||||
refine,https://twitter.com/civanozseyhan
|
||||
reflex,https://twitter.com/getreflex
|
||||
reforged-labs,https://twitter.com/ycombinator
|
||||
relace,https://twitter.com/ycombinator
|
||||
relate,https://twitter.com/chrischae__
|
||||
remade,https://x.com/Christos_antono
|
||||
remy,https://twitter.com/ycombinator
|
||||
remy-2,https://x.com/remysearch
|
||||
rentflow,https://twitter.com/ycombinator
|
||||
requestly,https://twitter.com/sachinjain024
|
||||
resend,https://x.com/zenorocha
|
||||
respaid,https://twitter.com/johnbanr
|
||||
reticular,https://x.com/nithinparsan
|
||||
retrofix-ai,https://twitter.com/danieldoesdev
|
||||
revamp,https://twitter.com/getrevamp_ai
|
||||
revyl,https://x.com/landseerenga
|
||||
reworkd,https://twitter.com/asimdotshrestha
|
||||
reworks,https://twitter.com/ycombinator
|
||||
rift,https://twitter.com/FilipTwarowski
|
||||
riskangle,https://twitter.com/ycombinator
|
||||
riskcube,https://x.com/andrei_risk
|
||||
rivet,https://twitter.com/nicholaskissel
|
||||
riveter-ai,https://x.com/AGrillz
|
||||
roame,https://x.com/timtqin
|
||||
roforco,https://x.com/brain_xiang
|
||||
rome,https://twitter.com/craigzLiszt
|
||||
roomplays,https://twitter.com/criyaco
|
||||
rosebud-biosciences,https://twitter.com/KitchenerWilson
|
||||
rowboat-labs,https://twitter.com/segmenta
|
||||
rubber-ducky-labs,https://twitter.com/alexandraj777
|
||||
ruleset,https://twitter.com/LoganFrederick
|
||||
ryvn,https://x.com/ryvnai
|
||||
safetykit,https://twitter.com/ycombinator
|
||||
sage-ai,https://twitter.com/akhilmurthy20
|
||||
saldor,https://x.com/notblandjacob
|
||||
salient,https://twitter.com/ycombinator
|
||||
schemeflow,https://x.com/browninghere
|
||||
sculpt,https://twitter.com/ycombinator
|
||||
seals-ai,https://x.com/luismariogm
|
||||
seis,https://twitter.com/TrevMcKendrick
|
||||
sensei,https://twitter.com/ycombinator
|
||||
sensorsurf,https://twitter.com/noahjepstein
|
||||
sepal-ai,https://www.twitter.com/katqhu1
|
||||
serial,https://twitter.com/Serialmfg
|
||||
serif-health,https://www.twitter.com/mfrobben
|
||||
serra,https://twitter.com/ycombinator
|
||||
shasta-health,https://twitter.com/SrinjoyMajumdar
|
||||
shekel-mobility,https://twitter.com/ShekelMobility
|
||||
shortbread,https://twitter.com/ShortbreadAI
|
||||
showandtell,https://twitter.com/ycombinator
|
||||
sidenote,https://twitter.com/jclin22009
|
||||
sieve,https://twitter.com/mokshith_v
|
||||
silkchart,https://twitter.com/afakerele
|
||||
simple-ai,https://twitter.com/catheryn_li
|
||||
simplehash,https://twitter.com/Alex_Kilkka
|
||||
simplex,https://x.com/simplexdata
|
||||
simplifine,https://x.com/egekduman
|
||||
sizeless,https://twitter.com/cornelius_einem
|
||||
skyvern,https://x.com/itssuchintan
|
||||
slingshot,https://twitter.com/ycombinator
|
||||
snowpilot,https://x.com/snowpilotai
|
||||
soff,https://x.com/BernhardHausle1
|
||||
solum-health,https://twitter.com/ycombinator
|
||||
sonnet,https://twitter.com/ycombinator
|
||||
sophys,https://twitter.com/ycombinator
|
||||
sorcerer,https://x.com/big_veech
|
||||
soteri-skin,https://twitter.com/SoteriSkin
|
||||
sphere,https://twitter.com/nrudder_
|
||||
spine-ai,https://twitter.com/BudhkarAkshay
|
||||
spongecake,https://twitter.com/ycombinator
|
||||
spur,https://twitter.com/sneha8sivakumar
|
||||
sre-ai,https://twitter.com/ycombinator
|
||||
stably,https://x.com/JinjingLiang
|
||||
stack-ai,https://twitter.com/bernaceituno
|
||||
stellar,https://twitter.com/ycombinator
|
||||
stormy-ai-autonomous-marketing-agent,https://twitter.com/karmedge/
|
||||
strada,https://twitter.com/AmirProd1
|
||||
stream,https://twitter.com/ycombinator
|
||||
structured-labs,https://twitter.com/amruthagujjar
|
||||
studdy,https://twitter.com/mike_lamma
|
||||
subscriptionflow,https://twitter.com/KashifSaleemCEO
|
||||
subsets,https://twitter.com/ycombinator
|
||||
supercontrast,https://twitter.com/ycombinator
|
||||
supertone,https://twitter.com/trysupertone
|
||||
superunit,https://x.com/peter_marler
|
||||
sweep,https://twitter.com/wwzeng1
|
||||
syncly,https://x.com/synclyhq
|
||||
synnax,https://x.com/Emilbon99
|
||||
syntheticfi,https://x.com/SyntheticFi_SF
|
||||
t3-chat-prev-ping-gg,https://twitter.com/t3dotgg
|
||||
tableflow,https://twitter.com/mitchpatin
|
||||
tai,https://twitter.com/Tragen_ai
|
||||
tandem-2,https://x.com/Tandemspace
|
||||
taxgpt,https://twitter.com/ChKashifAli
|
||||
taylor-ai,https://twitter.com/brian_j_kim
|
||||
teamout,https://twitter.com/ycombinator
|
||||
tegon,https://twitter.com/harshithb4h
|
||||
terminal,https://x.com/withterminal
|
||||
theneo,https://twitter.com/robakid
|
||||
theya,https://twitter.com/vikasch
|
||||
thyme,https://twitter.com/ycombinator
|
||||
tiny,https://twitter.com/ycombinator
|
||||
tola,https://twitter.com/alencvisic
|
||||
trainy,https://twitter.com/TrainyAI
|
||||
trendex-we-tokenize-talent,https://twitter.com/ycombinator
|
||||
trueplace,https://twitter.com/ycombinator
|
||||
truewind,https://twitter.com/AlexLee611
|
||||
trusty,https://twitter.com/trustyhomes
|
||||
truva,https://twitter.com/gaurav_aggarwal
|
||||
tuesday,https://twitter.com/kai_jiabo_feng
|
||||
twenty,https://twitter.com/twentycrm
|
||||
twine,https://twitter.com/anandvalavalkar
|
||||
two-dots,https://twitter.com/HensonOrser1
|
||||
typa,https://twitter.com/sounhochung
|
||||
typeless,https://twitter.com/ycombinator
|
||||
unbound,https://twitter.com/ycombinator
|
||||
undermind,https://twitter.com/UndermindAI
|
||||
unison,https://twitter.com/maxim_xyz
|
||||
unlayer,https://twitter.com/adeelraza
|
||||
unstatiq,https://twitter.com/NishSingaraju
|
||||
unusual,https://x.com/willwjack
|
||||
upfront,https://twitter.com/KnowUpfront
|
||||
vaero,https://twitter.com/ycombinator
|
||||
vango-ai,https://twitter.com/vango_ai
|
||||
variance,https://twitter.com/karinemellata
|
||||
variant,https://twitter.com/bnj
|
||||
velos,https://twitter.com/OscarMHBF
|
||||
velt,https://twitter.com/rakesh_goyal
|
||||
vendra,https://x.com/vendraHQ
|
||||
vera-health,https://x.com/_maximall
|
||||
verata,https://twitter.com/ycombinator
|
||||
versive,https://twitter.com/getversive
|
||||
vessel,https://twitter.com/vesselapi
|
||||
vibe,https://twitter.com/ycombinator
|
||||
videogen,https://twitter.com/ycombinator
|
||||
vigilant,https://twitter.com/BenShumaker_
|
||||
vitalize-care,https://twitter.com/nikhiljdsouza
|
||||
viva-labs,https://twitter.com/vishal_the_jain
|
||||
vizly,https://twitter.com/vizlyhq
|
||||
vly-ai-2,https://x.com/victorxheng
|
||||
vocode,https://twitter.com/kianhooshmand
|
||||
void,https://x.com/parel_es
|
||||
voltic,https://twitter.com/ycombinator
|
||||
vooma,https://twitter.com/jessebucks
|
||||
wingback,https://twitter.com/tfriehe_
|
||||
winter,https://twitter.com/AzianMike
|
||||
wolfia,https://twitter.com/narenmano
|
||||
wordware,https://twitter.com/kozerafilip
|
||||
zenbase-ai,https://twitter.com/CyrusOfEden
|
||||
zeropath,https://x.com/zeropathAI
|
||||
|
@@ -1,29 +0,0 @@
import csv

companies = {}

with open("twitter_links.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        parts = line.split(":", 1)
        if len(parts) != 2:
            continue

        company, url = parts
        url = url.strip()

        # Store only the first URL for each company
        if company not in companies:
            companies[company] = url

# Write to CSV
with open("company_links.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Company", "Link"])
    for company, url in sorted(companies.items()):
        writer.writerow([company, url])

print(f"Deduped {len(companies)} companies to company_links.csv")
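The deduplication script above splits each `company: url` line at the first colon only; without `maxsplit=1`, the `https://` scheme inside the URL would be split as well and the `len(parts) != 2` check would reject every row. A quick illustration (the sample line is made up):

```python
line = "onyx: https://twitter.com/onyx_dot_app"  # hypothetical input row

company, url = line.split(":", 1)
print(company)      # -> "onyx"
print(url.strip())  # -> "https://twitter.com/onyx_dot_app"

# A plain split(":") would instead yield three pieces and fail the len(parts) != 2 check:
print(line.split(":"))  # -> ['onyx', ' https', '//twitter.com/onyx_dot_app']
```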
@@ -1,68 +0,0 @@
# Onyx AWS ECS Fargate CloudFormation Deployment

This directory contains CloudFormation templates and scripts to deploy Onyx on AWS ECS Fargate.

## Configuration

All configuration parameters are stored in a single JSON file: `onyx_config.json`. This file contains all the parameters needed for the different CloudFormation stacks.

Example:
```json
{
  "OnyxNamespace": "onyx",
  "Environment": "production",
  "EFSName": "onyx-efs",
  "AWSRegion": "us-east-2",
  "VpcID": "YOUR_VPC_ID",
  "SubnetIDs": "YOUR_SUBNET_ID1,YOUR_SUBNET_ID2",
  "DomainName": "YOUR_DOMAIN e.g. ecs.onyx.app",
  "ValidationMethod": "DNS",
  "HostedZoneId": ""
}
```

### Required Parameters

- `Environment`: Used to prefix all stack names during deployment. This is required.
- `OnyxNamespace`: Namespace for the Onyx deployment.
- `EFSName`: Name for the Elastic File System.
- `AWSRegion`: AWS region where resources will be deployed.
- `VpcID`: ID of the VPC where Onyx will be deployed.
- `SubnetIDs`: Comma-separated list of subnet IDs for deployment.
- `DomainName`: Domain name for the Onyx deployment.
- `ValidationMethod`: Method for domain validation (typically "DNS").
- [optional] `HostedZoneId`: Route 53 hosted zone ID (only needed if using Route 53 for DNS).

The deployment script automatically extracts the parameters needed by each CloudFormation template, based on the parameter names declared in the templates; a minimal sketch of that extraction step follows.
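A rough sketch of that per-template parameter filtering, reimagined in Python (the repository itself does this in `deploy.sh` with `jq` and `sed`; the tag-tolerant YAML loader below is an assumption for illustration, not part of the repo):

```python
import json
import re

import yaml  # pip install pyyaml


class _CfnLoader(yaml.SafeLoader):
    """YAML loader that tolerates CloudFormation short-form tags like !Ref and !Sub."""


_CfnLoader.add_multi_constructor("!", lambda loader, suffix, node: None)


def load_config(path: str = "onyx_config.json") -> dict:
    # The config file allows // comments, so strip them before parsing.
    # (Safe here because no config value contains "//".)
    with open(path) as f:
        uncommented = re.sub(r"//.*$", "", f.read(), flags=re.MULTILINE)
    return json.loads(uncommented)


def parameters_for_template(template_path: str, config: dict) -> list[dict]:
    # Keep only the config keys that the template actually declares as Parameters.
    with open(template_path) as f:
        template = yaml.load(f, Loader=_CfnLoader)
    declared = set((template or {}).get("Parameters", {}))
    return [
        {"ParameterKey": key, "ParameterValue": str(value)}
        for key, value in config.items()
        if key in declared and value not in (None, "")
    ]
```

This mirrors what `create_parameters_from_json` in `deploy.sh` writes into the temporary `*_parameters.json` files passed to `aws cloudformation deploy`.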

## Deployment Order

The deployment follows this order (a minimal scripted sketch follows the list):

1. Infrastructure stacks:
   - EFS
   - Cluster
   - ACM

2. Service stacks:
   - Postgres
   - Redis
   - Vespa Engine
   - Model Server (Indexing)
   - Model Server (Inference)
   - Backend API Server
   - Backend Background Server
   - Web Server
   - Nginx

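As a hedged illustration of what that ordering amounts to, here is a boto3 sketch rather than the supported `deploy.sh` entry point. Stack names follow the script's `<Environment>-<template basename>` convention (for example `production-onyx-efs`); parameters are omitted for brevity and would be built as in the filtering sketch above.

```python
import boto3

INFRA_ORDER = ["onyx_efs", "onyx_cluster", "onyx_acm"]
SERVICE_ORDER = [
    "onyx_postgres_service",
    "onyx_redis_service",
    "onyx_vespaengine_service",
    "onyx_model_server_indexing_service",
    "onyx_model_server_inference_service",
    "onyx_backend_api_server_service",
    "onyx_backend_background_server_service",
    "onyx_web_server_service",
    "onyx_nginx_service",
]


def deploy_in_order(environment: str = "production", region: str = "us-east-2") -> None:
    cfn = boto3.client("cloudformation", region_name=region)
    for name in INFRA_ORDER + SERVICE_ORDER:
        stack_name = f"{environment}-{name}".replace("_", "-")
        template_path = (
            f"{name}_template.yaml" if name in INFRA_ORDER else f"services/{name}_template.yaml"
        )
        with open(template_path) as f:
            body = f.read()
        # Parameters omitted for brevity; create_stack raises AlreadyExistsException
        # if the stack exists, which deploy.sh simply treats as "skip".
        cfn.create_stack(
            StackName=stack_name,
            TemplateBody=body,
            Capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"],
        )
        # Wait for each stack so later stacks can import this one's exported outputs.
        cfn.get_waiter("stack_create_complete").wait(StackName=stack_name)
```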
## Usage

To deploy:
```bash
./deploy.sh
```

To uninstall:
```bash
./uninstall.sh
```
@@ -1,194 +0,0 @@
#!/bin/bash

# Function to remove comments from JSON and output valid JSON
remove_comments() {
    sed 's/\/\/.*$//' "$1" | grep -v '^[[:space:]]*$'
}

# Variables
TEMPLATE_DIR="$(pwd)"
SERVICE_DIR="$TEMPLATE_DIR/services"

# Unified config file
CONFIG_FILE="onyx_config.json"

# Try to get AWS_REGION from config, fallback to default if not found
AWS_REGION_FROM_CONFIG=$(remove_comments "$CONFIG_FILE" | jq -r '.AWSRegion // empty')
if [ -n "$AWS_REGION_FROM_CONFIG" ]; then
    AWS_REGION="$AWS_REGION_FROM_CONFIG"
else
    AWS_REGION="${AWS_REGION:-us-east-2}"
fi

# Get environment from config file
ENVIRONMENT=$(remove_comments "$CONFIG_FILE" | jq -r '.Environment')
if [ -z "$ENVIRONMENT" ] || [ "$ENVIRONMENT" == "null" ]; then
    echo "Missing Environment in $CONFIG_FILE. Please add the Environment field."
    exit 1
fi

# Try to get S3_BUCKET from config, fallback to default if not found
S3_BUCKET_FROM_CONFIG=$(remove_comments "$CONFIG_FILE" | jq -r '.S3Bucket // empty')
if [ -n "$S3_BUCKET_FROM_CONFIG" ]; then
    S3_BUCKET="$S3_BUCKET_FROM_CONFIG"
else
    S3_BUCKET="${S3_BUCKET:-onyx-ecs-fargate-configs}"
fi

INFRA_ORDER=(
    "onyx_efs_template.yaml"
    "onyx_cluster_template.yaml"
    "onyx_acm_template.yaml"
)

# Deployment order for services
SERVICE_ORDER=(
    "onyx_postgres_service_template.yaml"
    "onyx_redis_service_template.yaml"
    "onyx_vespaengine_service_template.yaml"
    "onyx_model_server_indexing_service_template.yaml"
    "onyx_model_server_inference_service_template.yaml"
    "onyx_backend_api_server_service_template.yaml"
    "onyx_backend_background_server_service_template.yaml"
    "onyx_web_server_service_template.yaml"
    "onyx_nginx_service_template.yaml"
)

# Function to validate a CloudFormation template
validate_template() {
    local template_file=$1
    echo "Validating template: $template_file..."
    aws cloudformation validate-template --template-body file://"$template_file" --region "$AWS_REGION" > /dev/null
    if [ $? -ne 0 ]; then
        echo "Error: Validation failed for $template_file. Exiting."
        exit 1
    fi
    echo "Validation succeeded for $template_file."
}

# Function to create CloudFormation parameters from JSON
create_parameters_from_json() {
    local template_file=$1
    local temp_params_file="${template_file%.yaml}_parameters.json"

    # Convert the config file contents to CloudFormation parameter format
    echo "[" > "$temp_params_file"

    # Process all key-value pairs from the config file
    local first=true
    remove_comments "$CONFIG_FILE" | jq -r 'to_entries[] | select(.value != null and .value != "") | "\(.key)|\(.value)"' | while IFS='|' read -r key value; do
        if [ "$first" = true ]; then
            first=false
        else
            echo "," >> "$temp_params_file"
        fi
        echo " {\"ParameterKey\": \"$key\", \"ParameterValue\": \"$value\"}" >> "$temp_params_file"
    done

    echo "]" >> "$temp_params_file"

    # Debug output - display the created parameters file
    echo "Generated parameters file: $temp_params_file" >&2
    echo "Contents:" >&2
    cat "$temp_params_file" >&2

    # Return just the filename
    echo "$temp_params_file"
}

# Function to deploy a CloudFormation stack
deploy_stack() {
    local stack_name=$1
    local template_file=$2

    echo "Checking if stack $stack_name exists..."
    if aws cloudformation describe-stacks --stack-name "$stack_name" --region "$AWS_REGION" > /dev/null 2>&1; then
        echo "Stack $stack_name already exists. Skipping deployment."
        return 0
    fi

    # Create temporary parameters file for this template
    local temp_params_file=$(create_parameters_from_json "$template_file")

    # Special handling for SubnetIDs parameter if needed
    if grep -q "SubnetIDs" "$template_file"; then
        echo "Template uses SubnetIDs parameter, ensuring it's properly formatted..."
        # Make sure we're passing SubnetIDs as a comma-separated list
        local subnet_ids=$(remove_comments "$CONFIG_FILE" | jq -r '.SubnetIDs // empty')
        if [ -n "$subnet_ids" ]; then
            echo "Using SubnetIDs from config: $subnet_ids"
        else
            echo "Warning: SubnetIDs not found in config but template requires it."
        fi
    fi

    echo "Deploying stack: $stack_name with template: $template_file and generated config from: $CONFIG_FILE..."
    aws cloudformation deploy \
        --stack-name "$stack_name" \
        --template-file "$template_file" \
        --parameter-overrides file://"$temp_params_file" \
        --capabilities CAPABILITY_IAM CAPABILITY_NAMED_IAM CAPABILITY_AUTO_EXPAND \
        --region "$AWS_REGION" \
        --no-cli-auto-prompt > /dev/null

    if [ $? -ne 0 ]; then
        echo "Error: Deployment failed for $stack_name. Exiting."
        exit 1
    fi

    # Clean up temporary parameter file
    rm "$temp_params_file"

    echo "Stack deployed successfully: $stack_name."
}

convert_underscores_to_hyphens() {
    local input_string="$1"
    local converted_string="${input_string//_/-}"
    echo "$converted_string"
}

deploy_infra_stacks() {
    for template_name in "${INFRA_ORDER[@]}"; do
        # Skip ACM template if HostedZoneId is not set
        if [[ "$template_name" == "onyx_acm_template.yaml" ]]; then
            HOSTED_ZONE_ID=$(remove_comments "$CONFIG_FILE" | jq -r '.HostedZoneId')
            if [ -z "$HOSTED_ZONE_ID" ] || [ "$HOSTED_ZONE_ID" == "" ] || [ "$HOSTED_ZONE_ID" == "null" ]; then
                echo "Skipping ACM template deployment because HostedZoneId is not set in $CONFIG_FILE"
                continue
            fi
        fi

        template_file="$template_name"
        stack_name="$ENVIRONMENT-$(basename "$template_name" _template.yaml)"
        stack_name=$(convert_underscores_to_hyphens "$stack_name")

        if [ -f "$template_file" ]; then
            validate_template "$template_file"
            deploy_stack "$stack_name" "$template_file"
        else
            echo "Warning: Template file $template_file not found. Skipping."
        fi
    done
}

deploy_services_stacks() {
    for template_name in "${SERVICE_ORDER[@]}"; do
        template_file="$SERVICE_DIR/$template_name"
        stack_name="$ENVIRONMENT-$(basename "$template_name" _template.yaml)"
        stack_name=$(convert_underscores_to_hyphens "$stack_name")

        if [ -f "$template_file" ]; then
            validate_template "$template_file"
            deploy_stack "$stack_name" "$template_file"
        else
            echo "Warning: Template file $template_file not found. Skipping."
        fi
    done
}

echo "Starting deployment of Onyx to ECS Fargate Cluster..."
deploy_infra_stacks
deploy_services_stacks

echo "All templates validated and deployed successfully."
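For reference, the existence check that makes `deploy.sh` idempotent (skip any stack that is already deployed) looks roughly like this in Python; `stack_exists` is an illustrative helper, not part of the repository:

```python
import boto3
from botocore.exceptions import ClientError


def stack_exists(stack_name: str, region: str = "us-east-2") -> bool:
    # Mirrors `aws cloudformation describe-stacks --stack-name ...` in deploy_stack():
    # a successful describe call means the stack is already there and can be skipped.
    cfn = boto3.client("cloudformation", region_name=region)
    try:
        cfn.describe_stacks(StackName=stack_name)
        return True
    except ClientError as err:
        if "does not exist" in err.response["Error"]["Message"]:
            return False
        raise
```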
@@ -1,31 +0,0 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: CloudFormation template to create an ACM Certificate.

Parameters:
  DomainName:
    Type: String
    Description: The primary domain name for the certificate (e.g., example.com).
    Default: example.com
  Environment:
    Type: String
    Default: production
  ValidationMethod:
    Type: String
    Default: DNS

Resources:
  Certificate:
    Type: AWS::CertificateManager::Certificate
    Properties:
      DomainName: !Ref DomainName
      ValidationMethod: !Ref ValidationMethod
      Tags:
        - Key: env
          Value: !Ref Environment

Outputs:
  OutputAcm:
    Description: ACM Cert Id
    Value: !Ref Certificate
    Export:
      Name: !Sub ${AWS::StackName}-OnyxCertificate
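When `HostedZoneId` is left empty (no Route 53), the DNS validation record for this certificate has to be created manually at your DNS provider. A small, illustrative way to look it up with boto3 (the certificate ARN would come from the stack's `OutputAcm` output; the record may take a moment to appear after the certificate is requested):

```python
import boto3


def get_dns_validation_record(certificate_arn: str, region: str = "us-east-2") -> dict:
    # Returns the CNAME name/value pair that ACM needs to see before it issues the cert.
    acm = boto3.client("acm", region_name=region)
    cert = acm.describe_certificate(CertificateArn=certificate_arn)["Certificate"]
    option = cert["DomainValidationOptions"][0]
    return option["ResourceRecord"]  # {"Name": ..., "Type": "CNAME", "Value": ...}
```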
@@ -1,156 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: The template used to create an ECS Cluster from the ECS Console.
|
||||
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
Description: The environment that is used in the name of the cluster as well.
|
||||
OnyxNamespace:
|
||||
Type: String
|
||||
Default: onyx
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
|
||||
Resources:
|
||||
ECSCluster:
|
||||
Type: AWS::ECS::Cluster
|
||||
Properties:
|
||||
ClusterName: !Sub ${Environment}-onyx-cluster
|
||||
CapacityProviders:
|
||||
- FARGATE
|
||||
- FARGATE_SPOT
|
||||
ClusterSettings:
|
||||
- Name: containerInsights
|
||||
Value: enhanced
|
||||
ServiceConnectDefaults:
|
||||
Namespace: !Sub ${Environment}-onyx-cluster
|
||||
Tags:
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
- Key: app
|
||||
Value: onyx
|
||||
|
||||
S3Bucket:
|
||||
Type: AWS::S3::Bucket
|
||||
Properties:
|
||||
BucketName: !Sub ${Environment}-onyx-ecs-fargate-configs
|
||||
AccessControl: Private
|
||||
BucketEncryption:
|
||||
ServerSideEncryptionConfiguration:
|
||||
- ServerSideEncryptionByDefault:
|
||||
SSEAlgorithm: AES256
|
||||
PublicAccessBlockConfiguration:
|
||||
BlockPublicAcls: true
|
||||
BlockPublicPolicy: true
|
||||
IgnorePublicAcls: true
|
||||
RestrictPublicBuckets: true
|
||||
|
||||
PrivateDnsNamespace:
|
||||
Type: AWS::ServiceDiscovery::PrivateDnsNamespace
|
||||
Properties:
|
||||
Description: AWS Cloud Map private DNS namespace for resources for onyx website.
|
||||
Vpc: !Ref VpcID
|
||||
Name: !Ref OnyxNamespace
|
||||
Properties:
|
||||
DnsProperties:
|
||||
SOA:
|
||||
TTL: 50
|
||||
|
||||
ECSTaskRole:
|
||||
Type: AWS::IAM::Role
|
||||
Properties:
|
||||
RoleName: !Sub ${Environment}-OnyxEcsTaskRole
|
||||
AssumeRolePolicyDocument:
|
||||
Version: "2012-10-17"
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Principal:
|
||||
Service: ecs-tasks.amazonaws.com
|
||||
Action: sts:AssumeRole
|
||||
Policies:
|
||||
- PolicyName: "EFSPolicy"
|
||||
PolicyDocument:
|
||||
Version: "2012-10-17"
|
||||
Statement:
|
||||
- Sid: "VisualEditor0"
|
||||
Effect: Allow
|
||||
Action:
|
||||
- "elasticfilesystem:*"
|
||||
Resource:
|
||||
- !Sub "arn:aws:elasticfilesystem:*:${AWS::AccountId}:access-point/*"
|
||||
- !Sub "arn:aws:elasticfilesystem:*:${AWS::AccountId}:file-system/*"
|
||||
- Sid: "VisualEditor1"
|
||||
Effect: Allow
|
||||
Action: "elasticfilesystem:*"
|
||||
Resource: "*"
|
||||
- PolicyName: "S3Policy"
|
||||
PolicyDocument:
|
||||
Version: "2012-10-17"
|
||||
Statement:
|
||||
- Sid: "VisualEditor0"
|
||||
Effect: Allow
|
||||
Action:
|
||||
- "s3:GetObject"
|
||||
- "s3:ListBucket"
|
||||
Resource:
|
||||
- !Sub "arn:aws:s3:::${Environment}-onyx-ecs-fargate-configs/*"
|
||||
- !Sub "arn:aws:s3:::${Environment}-onyx-ecs-fargate-configs"
|
||||
|
||||
ECSTaskExecutionRole:
|
||||
Type: AWS::IAM::Role
|
||||
Properties:
|
||||
RoleName: !Sub ${Environment}-OnyxECSTaskExecutionRole
|
||||
AssumeRolePolicyDocument:
|
||||
Version: "2012-10-17"
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Principal:
|
||||
Service: ecs-tasks.amazonaws.com
|
||||
Action: sts:AssumeRole
|
||||
ManagedPolicyArns:
|
||||
- arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy
|
||||
Policies:
|
||||
- PolicyName: "CloudWatchLogsPolicy"
|
||||
PolicyDocument:
|
||||
Version: "2012-10-17"
|
||||
Statement:
|
||||
- Sid: "VisualEditor0"
|
||||
Effect: Allow
|
||||
Action: "logs:CreateLogGroup"
|
||||
Resource: !Sub "arn:aws:logs:*:${AWS::AccountId}:log-group:*"
|
||||
- PolicyName: "SecretsManagerPolicy"
|
||||
PolicyDocument:
|
||||
Version: "2012-10-17"
|
||||
Statement:
|
||||
- Effect: Allow
|
||||
Action:
|
||||
- secretsmanager:GetSecretValue
|
||||
Resource: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${Environment}/postgres/user/password-*
|
||||
|
||||
Outputs:
|
||||
OutputEcsCluster:
|
||||
Description: Onyx ECS Cluster
|
||||
Value: !Ref ECSCluster
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-ECSClusterName
|
||||
OutputECSTaskRole:
|
||||
Description: Onyx ECS Task Role
|
||||
Value: !Ref ECSTaskRole
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-ECSTaskRole
|
||||
OutputECSTaskExecutionRole:
|
||||
Description: Onyx ECS TaskExecutionRole
|
||||
Value: !Ref ECSTaskExecutionRole
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-ECSTaskExecutionRole
|
||||
OutputOnyxNamespace:
|
||||
Description: Onyx CloudMap namespace ID for ECS service discovery.
|
||||
Value: !Ref PrivateDnsNamespace
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-OnyxNamespace
|
||||
OutputOnyxNamespaceName:
|
||||
Description: Onyx CloudMap namespace domain name for ECS service discovery.
|
||||
Value: !Ref OnyxNamespace
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-OnyxNamespaceName
|
||||
@@ -1,16 +0,0 @@
{
  // Naming, likely doesn't need to be changed
  "OnyxNamespace": "onyx",
  "Environment": "production",
  "EFSName": "onyx-efs",

  // Region and VPC Stuff
  "AWSRegion": "us-east-2",
  "VpcID": "YOUR_VPC_ID",
  "SubnetIDs": "YOUR_SUBNET_ID1,YOUR_SUBNET_ID2",

  // Domain and ACM Stuff
  "DomainName": "YOUR_DOMAIN e.g. ecs.onyx.app",
  "ValidationMethod": "DNS",
  "HostedZoneId": "" // Only specify if using Route 53 for DNS
}
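Because this file is JSON-with-comments rather than strict JSON, anything that consumes it (the `remove_comments` helper in `deploy.sh`, or the loader sketched earlier) must strip the `//` comments first. A small, purely illustrative pre-flight check that the placeholder values have been filled in before running the deployment:

```python
REQUIRED_KEYS = [
    "Environment", "OnyxNamespace", "EFSName", "AWSRegion",
    "VpcID", "SubnetIDs", "DomainName", "ValidationMethod",
]


def check_config(config: dict) -> None:
    # HostedZoneId is intentionally allowed to stay empty (non-Route 53 DNS).
    missing = [key for key in REQUIRED_KEYS if not config.get(key)]
    placeholders = [key for key in REQUIRED_KEYS if str(config.get(key, "")).startswith("YOUR_")]
    if missing or placeholders:
        raise ValueError(f"Fill in config values: missing={missing}, placeholders={placeholders}")
```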
@@ -1,128 +0,0 @@
|
||||
Parameters:
|
||||
|
||||
EFSName:
|
||||
Type: String
|
||||
Default: onyx-efs
|
||||
Environment:
|
||||
Type: String
|
||||
Default: production
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-0f230ca52bb04c722
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
|
||||
Resources:
|
||||
|
||||
OnyxEfs:
|
||||
Type: AWS::EFS::FileSystem
|
||||
Properties:
|
||||
BackupPolicy:
|
||||
Status: ENABLED
|
||||
Encrypted: True
|
||||
PerformanceMode: generalPurpose
|
||||
FileSystemTags:
|
||||
- Key: Name
|
||||
Value: !Sub ${Environment}-${EFSName}-${AWS::Region}-${AWS::AccountId}
|
||||
FileSystemProtection:
|
||||
ReplicationOverwriteProtection: ENABLED
|
||||
ThroughputMode: elastic
|
||||
|
||||
VespaEngineTmpEfsAccessPoint:
|
||||
Type: AWS::EFS::AccessPoint
|
||||
Properties:
|
||||
AccessPointTags:
|
||||
- Key: Name
|
||||
Value: vespaengine-tmp
|
||||
FileSystemId: !Ref OnyxEfs
|
||||
RootDirectory:
|
||||
CreationInfo:
|
||||
OwnerGid: "1000"
|
||||
OwnerUid: "1000"
|
||||
Permissions: "0755"
|
||||
Path: /var/tmp
|
||||
|
||||
VespaEngineDataEfsAccessPoint:
|
||||
Type: AWS::EFS::AccessPoint
|
||||
Properties:
|
||||
AccessPointTags:
|
||||
- Key: Name
|
||||
Value: vespaengine-data
|
||||
FileSystemId: !Ref OnyxEfs
|
||||
RootDirectory:
|
||||
CreationInfo:
|
||||
OwnerGid: "1000"
|
||||
OwnerUid: "1000"
|
||||
Permissions: "0755"
|
||||
Path: /opt/vespa/var
|
||||
|
||||
PostgresDataEfsAccessPoint:
|
||||
Type: AWS::EFS::AccessPoint
|
||||
Properties:
|
||||
AccessPointTags:
|
||||
- Key: Name
|
||||
Value: postgres-data
|
||||
FileSystemId: !Ref OnyxEfs
|
||||
RootDirectory:
|
||||
CreationInfo:
|
||||
OwnerGid: "1000"
|
||||
OwnerUid: "1000"
|
||||
Permissions: "0755"
|
||||
Path: /var/lib/postgresql/data
|
||||
|
||||
EFSMountTarget1:
|
||||
DependsOn: OnyxEfs
|
||||
Type: AWS::EFS::MountTarget
|
||||
Properties:
|
||||
FileSystemId: !Ref OnyxEfs
|
||||
SubnetId: !Select [0, !Ref SubnetIDs]
|
||||
SecurityGroups:
|
||||
- !Ref EFSSecurityGroupMountTargets
|
||||
|
||||
EFSMountTarget2:
|
||||
DependsOn: OnyxEfs
|
||||
Type: AWS::EFS::MountTarget
|
||||
Properties:
|
||||
FileSystemId: !Ref OnyxEfs
|
||||
SubnetId: !Select [1, !Ref SubnetIDs]
|
||||
SecurityGroups:
|
||||
- !Ref EFSSecurityGroupMountTargets
|
||||
|
||||
EFSSecurityGroupMountTargets:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: Security Group for EFS Mount Targets
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- IpProtocol: tcp
|
||||
FromPort: 2049
|
||||
ToPort: 2049
|
||||
CidrIp: 0.0.0.0/0
|
||||
|
||||
Outputs:
|
||||
OutputOnyxEfsId:
|
||||
Description: Onyx Filesystem Id
|
||||
Value: !Ref OnyxEfs
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-OnyxEfsId
|
||||
OutputVespaEngineTmpEfsAccessPoint:
|
||||
Description: VespaEngine Tmp AP
|
||||
Value: !Ref VespaEngineTmpEfsAccessPoint
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-VespaEngineTmpEfsAccessPoint
|
||||
OutputVespaEngineDataEfsAccessPoint:
|
||||
Description: VespaEngine Data Ap
|
||||
Value: !Ref VespaEngineDataEfsAccessPoint
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-VespaEngineDataEfsAccessPoint
|
||||
OutputPostgresDataEfsAccessPoint:
|
||||
Description: Postgres Data AP
|
||||
Value: !Ref PostgresDataEfsAccessPoint
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-PostgresDataEfsAccessPoint
|
||||
OutputEFSSecurityGroupMountTargets:
|
||||
Description: EFS Security Group
|
||||
Value: !Ref EFSSecurityGroupMountTargets
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-EFSSecurityGroupMountTargets
|
||||
@@ -1,216 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: CloudFormation template for Onyx Backend Api Server TaskDefinition
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-backend-api-server
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "2048"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "4096"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
|
||||
Resources:
|
||||
|
||||
ECSService:
|
||||
Type: AWS::ECS::Service
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: FARGATE
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}-service
|
||||
SchedulingStrategy: REPLICA
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: ENABLED
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: ENABLED
|
||||
SecurityGroups:
|
||||
- Ref: SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: LATEST
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 200
|
||||
MinimumHealthyPercent: 100
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: ECS
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
|
||||
GroupName: !Sub ${Environment}-ecs-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 8080
|
||||
ToPort: 8080
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 8080
|
||||
ToPort: 8080
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Sub ${Environment}-${ServiceName}-service
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
RuntimePlatform:
|
||||
CpuArchitecture: ARM64
|
||||
OperatingSystemFamily: LINUX
|
||||
ContainerDefinitions:
|
||||
- Name: onyx-backend
|
||||
Image: onyxdotapp/onyx-backend:latest
|
||||
Cpu: 0
|
||||
Essential: true
|
||||
Command:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
- |
|
||||
alembic upgrade head && echo "Starting Onyx Api Server" && uvicorn onyx.main:app --host 0.0.0.0 --port 8080
|
||||
PortMappings:
|
||||
- Name: backend
|
||||
ContainerPort: 8080
|
||||
HostPort: 8080
|
||||
Protocol: tcp
|
||||
AppProtocol: http
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: "25m"
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: ecs
|
||||
Environment:
|
||||
- Name: REDIS_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-redis-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: MODEL_SERVER_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-model-server-inference-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: VESPA_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-vespaengine-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: POSTGRES_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-postgres-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: INDEXING_MODEL_SERVER_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-model-server-indexing-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: AUTH_TYPE
|
||||
Value: disabled
|
||||
Secrets:
|
||||
- Name: POSTGRES_PASSWORD
|
||||
ValueFrom: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${Environment}/postgres/user/password
|
||||
VolumesFrom: []
|
||||
SystemControls: []
|
||||
|
||||
ECSAutoScalingTarget:
|
||||
Type: AWS::ApplicationAutoScaling::ScalableTarget
|
||||
DependsOn: ECSService
|
||||
Properties:
|
||||
MaxCapacity: 5
|
||||
MinCapacity: 1
|
||||
ResourceId: !Sub
|
||||
- "service/${ImportedCluster}/${Environment}-${ServiceName}-service"
|
||||
- ImportedCluster: !ImportValue
|
||||
'Fn::Sub': "${Environment}-onyx-cluster-ECSClusterName"
|
||||
ServiceName: !Ref ServiceName
|
||||
Environment: !Ref Environment
|
||||
ScalableDimension: ecs:service:DesiredCount
|
||||
ServiceNamespace: ecs
|
||||
|
||||
ECSAutoScalingPolicy:
|
||||
Type: AWS::ApplicationAutoScaling::ScalingPolicy
|
||||
Properties:
|
||||
PolicyName: !Sub ${Environment}-${ServiceName}-service-cpu-scaleout
|
||||
ScalingTargetId: !Ref ECSAutoScalingTarget
|
||||
PolicyType: TargetTrackingScaling
|
||||
TargetTrackingScalingPolicyConfiguration:
|
||||
TargetValue: 75
|
||||
PredefinedMetricSpecification:
|
||||
PredefinedMetricType: ECSServiceAverageCPUUtilization
|
||||
ScaleOutCooldown: 60
|
||||
ScaleInCooldown: 60
|
||||
|
||||
ECSAutoScalingPolicyMemory:
|
||||
Type: AWS::ApplicationAutoScaling::ScalingPolicy
|
||||
Properties:
|
||||
PolicyName: !Sub ${Environment}-${ServiceName}-service-mem-scaleout
|
||||
ScalingTargetId: !Ref ECSAutoScalingTarget
|
||||
PolicyType: TargetTrackingScaling
|
||||
TargetTrackingScalingPolicyConfiguration:
|
||||
TargetValue: 80
|
||||
PredefinedMetricSpecification:
|
||||
PredefinedMetricType: ECSServiceAverageMemoryUtilization
|
||||
ScaleOutCooldown: 60
|
||||
ScaleInCooldown: 60
|
||||
@@ -1,174 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: CloudFormation template for Onyx Backend Background Server TaskDefinition
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-backend-background-server
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "2048"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "4096"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
|
||||
Resources:
|
||||
|
||||
ECSService:
|
||||
Type: AWS::ECS::Service
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: FARGATE
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}-service
|
||||
SchedulingStrategy: REPLICA
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: ENABLED
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: ENABLED
|
||||
SecurityGroups:
|
||||
- Ref: SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: LATEST
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 200
|
||||
MinimumHealthyPercent: 100
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: ECS
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
|
||||
GroupName: !Sub ${Environment}-ecs-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 8080
|
||||
ToPort: 8080
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 8080
|
||||
ToPort: 8080
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Sub ${Environment}-${ServiceName}-service
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
RuntimePlatform:
|
||||
CpuArchitecture: ARM64
|
||||
OperatingSystemFamily: LINUX
|
||||
ContainerDefinitions:
|
||||
- Name: onyx-backend-background
|
||||
Image: onyxdotapp/onyx-backend:latest
|
||||
Cpu: 0
|
||||
Essential: true
|
||||
Command:
|
||||
- "/usr/bin/supervisord"
|
||||
- "-c"
|
||||
- "/etc/supervisor/conf.d/supervisord.conf"
|
||||
PortMappings:
|
||||
- Name: backend
|
||||
ContainerPort: 8080
|
||||
HostPort: 8080
|
||||
Protocol: tcp
|
||||
AppProtocol: http
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: "25m"
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: ecs
|
||||
Environment:
|
||||
- Name: REDIS_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-redis-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: MODEL_SERVER_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-model-server-inference-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: VESPA_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-vespaengine-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: POSTGRES_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-postgres-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: INDEXING_MODEL_SERVER_HOST
|
||||
Value: !Sub
|
||||
- "${Environment}-onyx-model-server-indexing-service.${ImportedNamespace}"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
- Name: AUTH_TYPE
|
||||
Value: disabled
|
||||
Secrets:
|
||||
- Name: POSTGRES_PASSWORD
|
||||
ValueFrom: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${Environment}/postgres/user/password
|
||||
VolumesFrom: []
|
||||
SystemControls: []
|
||||
@@ -1,163 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: CloudFormation template for Onyx Model Server Indexing TaskDefinition
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-model-server-indexing
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "2048"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "4096"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
|
||||
Resources:
|
||||
|
||||
ECSService:
|
||||
Type: AWS::ECS::Service
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: FARGATE
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}-service
|
||||
SchedulingStrategy: REPLICA
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: ENABLED
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: ENABLED
|
||||
SecurityGroups:
|
||||
- Ref: SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: LATEST
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 200
|
||||
MinimumHealthyPercent: 100
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: ECS
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
|
||||
GroupName: !Sub ${Environment}-ecs-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 9000
|
||||
ToPort: 9000
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 9000
|
||||
ToPort: 9000
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Sub ${Environment}-${ServiceName}-service
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
RuntimePlatform:
|
||||
CpuArchitecture: ARM64
|
||||
OperatingSystemFamily: LINUX
|
||||
ContainerDefinitions:
|
||||
- Name: onyx-model-server-indexing
|
||||
Image: onyxdotapp/onyx-model-server:latest
|
||||
Cpu: 0
|
||||
Essential: true
|
||||
Command:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
- >
|
||||
if [ "${DISABLE_MODEL_SERVER:-false}" = "True" ]; then echo 'Skipping service...';
|
||||
exit 0; else exec uvicorn model_server.main:app --host 0.0.0.0 --port 9000; fi
|
||||
PortMappings:
|
||||
- Name: model-server
|
||||
ContainerPort: 9000
|
||||
HostPort: 9000
|
||||
Protocol: tcp
|
||||
AppProtocol: http
|
||||
Environment:
|
||||
- Name: LOG_LEVEL
|
||||
Value: info
|
||||
- Name: INDEXING_ONLY
|
||||
Value: True
|
||||
- Name: VESPA_SEARCHER_THREADS
|
||||
Value: "1"
|
||||
MountPoints:
|
||||
- SourceVolume: efs-volume
|
||||
ContainerPath: /root/.cache/huggingface/
|
||||
ReadOnly: false
|
||||
VolumesFrom: []
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: "25m"
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: "ecs"
|
||||
SystemControls: []
|
||||
Volumes:
|
||||
- Name: efs-volume
|
||||
EFSVolumeConfiguration:
|
||||
FilesystemId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
|
||||
RootDirectory: "/"
|
||||
@@ -1,200 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: CloudFormation template for Onyx Model Server Inference TaskDefinition
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-model-server-inference
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "2048"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "4096"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
|
||||
Resources:
|
||||
|
||||
ECSService:
|
||||
Type: AWS::ECS::Service
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: FARGATE
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}-service
|
||||
SchedulingStrategy: REPLICA
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: ENABLED
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: ENABLED
|
||||
SecurityGroups:
|
||||
- Ref: SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: LATEST
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 200
|
||||
MinimumHealthyPercent: 100
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: ECS
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
|
||||
GroupName: !Sub ${Environment}-ecs-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 9000
|
||||
ToPort: 9000
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 9000
|
||||
ToPort: 9000
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Sub ${Environment}-${ServiceName}-service
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
RuntimePlatform:
|
||||
CpuArchitecture: ARM64
|
||||
OperatingSystemFamily: LINUX
|
||||
ContainerDefinitions:
|
||||
- Name: onyx-model-server-inference
|
||||
Image: onyxdotapp/onyx-model-server:latest
|
||||
Cpu: 0
|
||||
Essential: true
|
||||
Command:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
- >
|
||||
if [ "${DISABLE_MODEL_SERVER:-false}" = "True" ]; then echo 'Skipping service...';
|
||||
exit 0; else exec uvicorn model_server.main:app --host 0.0.0.0 --port 9000; fi
|
||||
PortMappings:
|
||||
- Name: model-server
|
||||
ContainerPort: 9000
|
||||
HostPort: 9000
|
||||
Protocol: tcp
|
||||
AppProtocol: http
|
||||
Environment:
|
||||
- Name: LOG_LEVEL
|
||||
Value: info
|
||||
MountPoints:
|
||||
- SourceVolume: efs-volume
|
||||
ContainerPath: /root/.cache/huggingface/
|
||||
ReadOnly: false
|
||||
VolumesFrom: []
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: "25m"
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: "ecs"
|
||||
SystemControls: []
|
||||
Volumes:
|
||||
- Name: efs-volume
|
||||
EFSVolumeConfiguration:
|
||||
FilesystemId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
|
||||
RootDirectory: "/"
|
||||
|
||||
ECSAutoScalingTarget:
|
||||
Type: AWS::ApplicationAutoScaling::ScalableTarget
|
||||
DependsOn: ECSService
|
||||
Properties:
|
||||
MaxCapacity: 5
|
||||
MinCapacity: 1
|
||||
ResourceId: !Sub
|
||||
- "service/${ImportedCluster}/${Environment}-${ServiceName}-service"
|
||||
- ImportedCluster: !ImportValue
|
||||
'Fn::Sub': "${Environment}-onyx-cluster-ECSClusterName"
|
||||
ServiceName: !Ref ServiceName
|
||||
Environment: !Ref Environment
|
||||
ScalableDimension: ecs:service:DesiredCount
|
||||
ServiceNamespace: ecs
|
||||
|
||||
ECSAutoScalingPolicy:
|
||||
Type: AWS::ApplicationAutoScaling::ScalingPolicy
|
||||
Properties:
|
||||
PolicyName: !Sub ${Environment}-${ServiceName}-service-cpu-scaleout
|
||||
ScalingTargetId: !Ref ECSAutoScalingTarget
|
||||
PolicyType: TargetTrackingScaling
|
||||
TargetTrackingScalingPolicyConfiguration:
|
||||
TargetValue: 75
|
||||
PredefinedMetricSpecification:
|
||||
PredefinedMetricType: ECSServiceAverageCPUUtilization
|
||||
ScaleOutCooldown: 60
|
||||
ScaleInCooldown: 60
|
||||
|
||||
ECSAutoScalingPolicyMemory:
|
||||
Type: AWS::ApplicationAutoScaling::ScalingPolicy
|
||||
Properties:
|
||||
PolicyName: !Sub ${Environment}-${ServiceName}-service-memory-scaleout
|
||||
ScalingTargetId: !Ref ECSAutoScalingTarget
|
||||
PolicyType: TargetTrackingScaling
|
||||
TargetTrackingScalingPolicyConfiguration:
|
||||
TargetValue: 80
|
||||
PredefinedMetricSpecification:
|
||||
PredefinedMetricType: ECSServiceAverageMemoryUtilization
|
||||
ScaleOutCooldown: 60
|
||||
ScaleInCooldown: 60
|
||||
@@ -1,288 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: "The template used to create an ECS Service from the ECS Console."
|
||||
|
||||
Parameters:
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
HostedZoneId:
|
||||
Type: String
|
||||
Default: ''
|
||||
DomainName:
|
||||
Type: String
|
||||
Default: demo.danswer.ai
|
||||
Environment:
|
||||
Type: String
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-nginx
|
||||
OnyxNamespace:
|
||||
Type: String
|
||||
Default: onyx
|
||||
OnyxBackendApiServiceName:
|
||||
Type: String
|
||||
Default: onyx-backend-api-server-service
|
||||
OnyxWebServerServiceName:
|
||||
Type: String
|
||||
Default: onyx-web-server-service
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "512"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "1024"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
GitHubConfigUrl:
|
||||
Type: String
|
||||
Default: "https://raw.githubusercontent.com/onyx-dot-app/onyx/main/deployment/data/nginx/app.conf.template.dev"
|
||||
Description: "URL to the nginx configuration file on GitHub"
|
||||
GitHubRunScriptUrl:
|
||||
Type: String
|
||||
Default: "https://raw.githubusercontent.com/onyx-dot-app/onyx/main/deployment/data/nginx/run-nginx.sh"
|
||||
Description: "URL to the nginx run script on GitHub"
|
||||
|
||||
Conditions:
|
||||
CreateRoute53: !Not
|
||||
- !Equals
|
||||
- !Ref HostedZoneId
|
||||
- ''
|
||||
|
||||
Resources:
|
||||
ECSService:
|
||||
Type: "AWS::ECS::Service"
|
||||
DependsOn: LoadBalancer
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: "FARGATE"
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}
|
||||
SchedulingStrategy: "REPLICA"
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: "ENABLED"
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: "ENABLED"
|
||||
SecurityGroups:
|
||||
- !Ref SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: "LATEST"
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 200
|
||||
MinimumHealthyPercent: 100
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: "ECS"
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt
|
||||
- "ServiceDiscoveryService"
|
||||
- "Arn"
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
LoadBalancers:
|
||||
- ContainerName: nginx
|
||||
ContainerPort: 80
|
||||
TargetGroupArn: !Ref TargetGroup
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
ContainerDefinitions:
|
||||
- Name: nginx
|
||||
Image: nginx:1.23.4-alpine
|
||||
Cpu: 0
|
||||
PortMappings:
|
||||
- Name: nginx-80-tcp
|
||||
ContainerPort: 80
|
||||
HostPort: 80
|
||||
Protocol: tcp
|
||||
Essential: true
|
||||
Command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- dos2unix /etc/nginx/conf.d/run-nginx.sh && /etc/nginx/conf.d/run-nginx.sh app.conf.template.dev
|
||||
Environment:
|
||||
- Name: EMAIL
|
||||
Value: ""
|
||||
- Name: DOMAIN
|
||||
Value: !Ref DomainName
|
||||
- Name: ONYX_BACKEND_API_HOST
|
||||
Value: !Sub ${Environment}-${OnyxBackendApiServiceName}.${OnyxNamespace}
|
||||
- Name: ONYX_WEB_SERVER_HOST
|
||||
Value: !Sub ${Environment}-${OnyxWebServerServiceName}.${OnyxNamespace}
|
||||
MountPoints:
|
||||
- SourceVolume: efs-volume
|
||||
ContainerPath: /etc/nginx/conf.d
|
||||
VolumesFrom: []
|
||||
DependsOn:
|
||||
- ContainerName: github-sync-container
|
||||
Condition: SUCCESS
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: !Sub /ecs/${Environment}-OnyxNginxTaskDefinition
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: 25m
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: ecs
|
||||
SystemControls: []
|
||||
- Name: github-sync-container
|
||||
Image: curlimages/curl:latest
|
||||
Cpu: 128
|
||||
MemoryReservation: 256
|
||||
PortMappings: []
|
||||
Essential: false
|
||||
Command:
|
||||
- sh
|
||||
- -c
|
||||
- !Sub |
|
||||
curl -L ${GitHubConfigUrl} -o /etc/nginx/conf.d/app.conf.template.dev &&
|
||||
curl -L ${GitHubRunScriptUrl} -o /etc/nginx/conf.d/run-nginx.sh &&
|
||||
chmod 644 /etc/nginx/conf.d/app.conf.template.dev &&
|
||||
chmod 755 /etc/nginx/conf.d/run-nginx.sh &&
|
||||
exit 0 || exit 1
|
||||
MountPoints:
|
||||
- SourceVolume: efs-volume
|
||||
ContainerPath: /etc/nginx/conf.d
|
||||
VolumesFrom: []
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: !Sub /ecs/${Environment}-github-sync-configs-TaskDefinition
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: 25m
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: ecs
|
||||
SystemControls: []
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
Volumes:
|
||||
- Name: efs-volume
|
||||
EFSVolumeConfiguration:
|
||||
FilesystemId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
|
||||
RootDirectory: /
|
||||
PlacementConstraints: []
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
EnableFaultInjection: false
|
||||
|
||||
SecurityGroup:
|
||||
Type: "AWS::EC2::SecurityGroup"
|
||||
Properties:
|
||||
GroupDescription: !Sub "Security group for ${ServiceName}"
|
||||
GroupName: !Sub ${Environment}-ecs-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 80
|
||||
ToPort: 80
|
||||
IpProtocol: "tcp"
|
||||
CidrIp: "0.0.0.0/0"
|
||||
- FromPort: 80
|
||||
ToPort: 80
|
||||
IpProtocol: "tcp"
|
||||
CidrIpv6: "::/0"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Ref ServiceName
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
LoadBalancer:
|
||||
Type: AWS::ElasticLoadBalancingV2::LoadBalancer
|
||||
DependsOn: SecurityGroup
|
||||
Properties:
|
||||
Type: application
|
||||
Scheme: internet-facing
|
||||
Subnets: !Ref SubnetIDs
|
||||
SecurityGroups:
|
||||
- !Ref SecurityGroup
|
||||
|
||||
LoadBalancerListener:
|
||||
Type: AWS::ElasticLoadBalancingV2::Listener
|
||||
Properties:
|
||||
LoadBalancerArn: !Ref LoadBalancer
|
||||
Port: 80
|
||||
Protocol: HTTP
|
||||
DefaultActions:
|
||||
- Type: forward
|
||||
TargetGroupArn: !Ref TargetGroup
|
||||
|
||||
TargetGroup:
|
||||
Type: AWS::ElasticLoadBalancingV2::TargetGroup
|
||||
Properties:
|
||||
HealthCheckEnabled: True
|
||||
HealthCheckIntervalSeconds: 30
|
||||
HealthCheckPort: 80
|
||||
HealthCheckPath: /api/health
|
||||
HealthCheckProtocol: HTTP
|
||||
HealthCheckTimeoutSeconds: 20
|
||||
HealthyThresholdCount: 3
|
||||
Port: 80
|
||||
Protocol: HTTP
|
||||
ProtocolVersion: HTTP1
|
||||
VpcId: !Ref VpcID
|
||||
TargetType: ip
|
||||
|
||||
Route53Record:
|
||||
Type: AWS::Route53::RecordSet
|
||||
Condition: CreateRoute53
|
||||
Properties:
|
||||
HostedZoneId: !Ref HostedZoneId
|
||||
Name: !Ref DomainName
|
||||
Type: A
|
||||
AliasTarget:
|
||||
DNSName: !GetAtt LoadBalancer.DNSName
|
||||
HostedZoneId: !GetAtt LoadBalancer.CanonicalHostedZoneID
|
||||
EvaluateTargetHealth: false
|
||||
|
||||
Outputs:
|
||||
ECSService:
|
||||
Description: "The created service."
|
||||
Value: !Ref "ECSService"
|
||||
ServiceDiscoveryService:
|
||||
Value: !Ref "ServiceDiscoveryService"
|
||||
OutputOnyxLoadBalancerDNSName:
|
||||
Description: LoadBalancer DNSName
|
||||
Value: !GetAtt LoadBalancer.DNSName
|
||||
Export:
|
||||
Name: !Sub ${AWS::StackName}-OnyxLoadBalancerDNSName
|
||||
@@ -1,177 +0,0 @@
|
||||
AWSTemplateFormatVersion: '2010-09-09'
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
Default: production
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-postgres
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "1024"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "2048"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
|
||||
Resources:
|
||||
|
||||
ECSService:
|
||||
Type: AWS::ECS::Service
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: FARGATE
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}-service
|
||||
SchedulingStrategy: REPLICA
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: DISABLED
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: ENABLED
|
||||
SecurityGroups:
|
||||
- !Ref SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: LATEST
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 100
|
||||
MinimumHealthyPercent: 0
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: ECS
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
|
||||
GroupName: !Sub ${Environment}-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 5432
|
||||
ToPort: 5432
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 5432
|
||||
ToPort: 5432
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
- FromPort: 2049
|
||||
ToPort: 2049
|
||||
IpProtocol: tcp
|
||||
SourceSecurityGroupId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-EFSSecurityGroupMountTargets"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Sub ${Environment}-${ServiceName}-service
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
RuntimePlatform:
|
||||
CpuArchitecture: ARM64
|
||||
OperatingSystemFamily: LINUX
|
||||
Volumes:
|
||||
- Name: efs-volume-data
|
||||
EFSVolumeConfiguration:
|
||||
FilesystemId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
|
||||
RootDirectory: "/"
|
||||
TransitEncryption: ENABLED
|
||||
AuthorizationConfig:
|
||||
AccessPointId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-PostgresDataEfsAccessPoint"
|
||||
ContainerDefinitions:
|
||||
- Name: !Ref ServiceName
|
||||
Image: postgres:15.2-alpine
|
||||
Cpu: 0
|
||||
Essential: true
|
||||
StopTimeout: 30
|
||||
Command:
|
||||
- "-c"
|
||||
- "max_connections=250"
|
||||
PortMappings:
|
||||
- Name: postgres
|
||||
ContainerPort: 5432
|
||||
HostPort: 5432
|
||||
Protocol: tcp
|
||||
AppProtocol: http
|
||||
Environment:
|
||||
- Name: POSTGRES_USER
|
||||
Value: postgres
|
||||
- Name: PGSSLMODE
|
||||
Value: require
|
||||
- Name: POSTGRES_DB
|
||||
Value: postgres
|
||||
Secrets:
|
||||
- Name: POSTGRES_PASSWORD
|
||||
ValueFrom: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${Environment}/postgres/user/password
|
||||
MountPoints:
|
||||
- SourceVolume: efs-volume-data
|
||||
ContainerPath: /var/lib/postgresql/data
|
||||
ReadOnly: false
|
||||
- SourceVolume: efs-volume-data
|
||||
ContainerPath: /var/lib/postgresql
|
||||
ReadOnly: false
|
||||
User: "1000"
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: /ecs/OnyxPostgresTaskDefinition
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: "25m"
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: ecs
|
||||
@@ -1,146 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: CloudFormation template for Onyx Redis TaskDefinition
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-redis
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "1024"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "2048"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
|
||||
Resources:
|
||||
|
||||
ECSService:
|
||||
Type: AWS::ECS::Service
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: FARGATE
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}-service
|
||||
SchedulingStrategy: REPLICA
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: ENABLED
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: ENABLED
|
||||
SecurityGroups:
|
||||
- Ref: SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: LATEST
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 200
|
||||
MinimumHealthyPercent: 100
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: ECS
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
|
||||
GroupName: !Sub ${Environment}-ecs-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 6379
|
||||
ToPort: 6379
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 6379
|
||||
ToPort: 6379
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Sub ${Environment}-${ServiceName}-service
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
RuntimePlatform:
|
||||
CpuArchitecture: ARM64
|
||||
OperatingSystemFamily: LINUX
|
||||
ContainerDefinitions:
|
||||
- Name: redis
|
||||
Image: redis:7.4-alpine
|
||||
Cpu: 0
|
||||
Essential: true
|
||||
Command:
|
||||
- "redis-server"
|
||||
- "--save"
|
||||
- "\"\""
|
||||
- "--appendonly"
|
||||
- "no"
|
||||
PortMappings:
|
||||
- Name: redis_port
|
||||
ContainerPort: 6379
|
||||
HostPort: 6379
|
||||
Protocol: tcp
|
||||
AppProtocol: http
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: "25m"
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: ecs
|
||||
Environment: []
|
||||
VolumesFrom: []
|
||||
SystemControls: []
|
||||
@@ -1,190 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: CloudFormation template for Onyx Vespa Engine TaskDefinition
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-vespaengine
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "4096"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "16384"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
|
||||
Resources:
|
||||
|
||||
ECSService:
|
||||
Type: AWS::ECS::Service
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: FARGATE
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}-service
|
||||
SchedulingStrategy: REPLICA
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: ENABLED
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: ENABLED
|
||||
SecurityGroups:
|
||||
- Ref: SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: LATEST
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 200
|
||||
MinimumHealthyPercent: 100
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: ECS
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
|
||||
GroupName: !Sub ${Environment}-ecs-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 19071
|
||||
ToPort: 19071
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 19071
|
||||
ToPort: 19071
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
- FromPort: 8081
|
||||
ToPort: 8081
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 8081
|
||||
ToPort: 8081
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
- FromPort: 2049
|
||||
ToPort: 2049
|
||||
IpProtocol: tcp
|
||||
SourceSecurityGroupId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-EFSSecurityGroupMountTargets"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Sub ${Environment}-${ServiceName}-service
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
RuntimePlatform:
|
||||
CpuArchitecture: ARM64
|
||||
OperatingSystemFamily: LINUX
|
||||
ContainerDefinitions:
|
||||
- Name: vespaengine
|
||||
Image: vespaengine/vespa:8.277.17
|
||||
Cpu: 0
|
||||
Essential: true
|
||||
PortMappings:
|
||||
- Name: vespaengine_port
|
||||
ContainerPort: 19071
|
||||
HostPort: 19071
|
||||
Protocol: tcp
|
||||
AppProtocol: http
|
||||
- Name: vespaengine_port2
|
||||
ContainerPort: 8081
|
||||
HostPort: 8081
|
||||
Protocol: tcp
|
||||
AppProtocol: http
|
||||
MountPoints:
|
||||
- SourceVolume: efs-volume-data
|
||||
ContainerPath: /opt/vespa/var
|
||||
ReadOnly: false
|
||||
- SourceVolume: efs-volume-tmp
|
||||
ContainerPath: /var/tmp
|
||||
ReadOnly: false
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: /ecs/OnyxVespaEngineTaskDefinition
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: "25m"
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: ecs
|
||||
User: "1000"
|
||||
Environment: []
|
||||
VolumesFrom: []
|
||||
SystemControls: []
|
||||
Volumes:
|
||||
- Name: efs-volume-tmp
|
||||
EFSVolumeConfiguration:
|
||||
FilesystemId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
|
||||
RootDirectory: "/"
|
||||
TransitEncryption: ENABLED
|
||||
AuthorizationConfig:
|
||||
AccessPointId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-VespaEngineTmpEfsAccessPoint"
|
||||
- Name: efs-volume-data
|
||||
EFSVolumeConfiguration:
|
||||
FilesystemId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
|
||||
RootDirectory: "/"
|
||||
TransitEncryption: ENABLED
|
||||
AuthorizationConfig:
|
||||
AccessPointId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-efs-VespaEngineDataEfsAccessPoint"
|
||||
@@ -1,190 +0,0 @@
|
||||
AWSTemplateFormatVersion: "2010-09-09"
|
||||
Description: CloudFormation template for Onyx Web Server TaskDefinition
|
||||
Parameters:
|
||||
Environment:
|
||||
Type: String
|
||||
SubnetIDs:
|
||||
Type: CommaDelimitedList
|
||||
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
|
||||
VpcID:
|
||||
Type: String
|
||||
Default: vpc-098cfa79d637dabff
|
||||
ServiceName:
|
||||
Type: String
|
||||
Default: onyx-web-server
|
||||
TaskCpu:
|
||||
Type: String
|
||||
Default: "1024"
|
||||
TaskMemory:
|
||||
Type: String
|
||||
Default: "2048"
|
||||
TaskDesiredCount:
|
||||
Type: Number
|
||||
Default: 1
|
||||
|
||||
Resources:
|
||||
|
||||
ECSService:
|
||||
Type: AWS::ECS::Service
|
||||
Properties:
|
||||
Cluster:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
|
||||
CapacityProviderStrategy:
|
||||
- CapacityProvider: FARGATE
|
||||
Base: 0
|
||||
Weight: 1
|
||||
TaskDefinition: !Ref TaskDefinition
|
||||
ServiceName: !Sub ${Environment}-${ServiceName}-service
|
||||
SchedulingStrategy: REPLICA
|
||||
DesiredCount: !Ref TaskDesiredCount
|
||||
AvailabilityZoneRebalancing: ENABLED
|
||||
NetworkConfiguration:
|
||||
AwsvpcConfiguration:
|
||||
AssignPublicIp: ENABLED
|
||||
SecurityGroups:
|
||||
- Ref: SecurityGroup
|
||||
Subnets: !Ref SubnetIDs
|
||||
PlatformVersion: LATEST
|
||||
DeploymentConfiguration:
|
||||
MaximumPercent: 200
|
||||
MinimumHealthyPercent: 100
|
||||
DeploymentCircuitBreaker:
|
||||
Enable: true
|
||||
Rollback: true
|
||||
DeploymentController:
|
||||
Type: ECS
|
||||
ServiceConnectConfiguration:
|
||||
Enabled: false
|
||||
ServiceRegistries:
|
||||
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
|
||||
Tags:
|
||||
- Key: app
|
||||
Value: onyx
|
||||
- Key: service
|
||||
Value: !Ref ServiceName
|
||||
- Key: env
|
||||
Value: !Ref Environment
|
||||
EnableECSManagedTags: true
|
||||
|
||||
SecurityGroup:
|
||||
Type: AWS::EC2::SecurityGroup
|
||||
Properties:
|
||||
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
|
||||
GroupName: !Sub ${Environment}-ecs-${ServiceName}
|
||||
VpcId: !Ref VpcID
|
||||
SecurityGroupIngress:
|
||||
- FromPort: 3000
|
||||
ToPort: 3000
|
||||
IpProtocol: tcp
|
||||
CidrIp: 0.0.0.0/0
|
||||
- FromPort: 3000
|
||||
ToPort: 3000
|
||||
IpProtocol: tcp
|
||||
CidrIpv6: "::/0"
|
||||
|
||||
ServiceDiscoveryService:
|
||||
Type: "AWS::ServiceDiscovery::Service"
|
||||
Properties:
|
||||
Name: !Sub ${Environment}-${ServiceName}-service
|
||||
DnsConfig:
|
||||
DnsRecords:
|
||||
- Type: "A"
|
||||
TTL: 15
|
||||
NamespaceId:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
|
||||
HealthCheckCustomConfig:
|
||||
FailureThreshold: 1
|
||||
|
||||
TaskDefinition:
|
||||
Type: AWS::ECS::TaskDefinition
|
||||
Properties:
|
||||
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
|
||||
TaskRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
|
||||
ExecutionRoleArn:
|
||||
Fn::ImportValue:
|
||||
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
|
||||
NetworkMode: awsvpc
|
||||
RequiresCompatibilities:
|
||||
- FARGATE
|
||||
Cpu: !Ref TaskCpu
|
||||
Memory: !Ref TaskMemory
|
||||
RuntimePlatform:
|
||||
CpuArchitecture: ARM64
|
||||
OperatingSystemFamily: LINUX
|
||||
ContainerDefinitions:
|
||||
- Name: onyx-webserver
|
||||
Image: onyxdotapp/onyx-web-server:latest
|
||||
Cpu: 0
|
||||
Essential: true
|
||||
PortMappings:
|
||||
- Name: webserver
|
||||
ContainerPort: 3000
|
||||
HostPort: 3000
|
||||
Protocol: tcp
|
||||
Environment:
|
||||
- Name: NEXT_PUBLIC_DISABLE_STREAMING
|
||||
Value: "false"
|
||||
- Name: NEXT_PUBLIC_NEW_CHAT_DIRECTS_TO_SAME_PERSONA
|
||||
Value: "false"
|
||||
- Name: INTERNAL_URL
|
||||
Value: !Sub
|
||||
- "http://${Environment}-onyx-backend-api-server-service.${ImportedNamespace}:8080"
|
||||
- ImportedNamespace: !ImportValue
|
||||
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
|
||||
LogConfiguration:
|
||||
LogDriver: awslogs
|
||||
Options:
|
||||
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
|
||||
mode: non-blocking
|
||||
awslogs-create-group: "true"
|
||||
max-buffer-size: "25m"
|
||||
awslogs-region: !Ref AWS::Region
|
||||
awslogs-stream-prefix: ecs
|
||||
User: "1000"
|
||||
VolumesFrom: []
|
||||
SystemControls: []
|
||||
|
||||
ECSAutoScalingTarget:
|
||||
Type: AWS::ApplicationAutoScaling::ScalableTarget
|
||||
DependsOn: ECSService
|
||||
Properties:
|
||||
MaxCapacity: 5
|
||||
MinCapacity: 1
|
||||
ResourceId: !Sub
|
||||
- "service/${ImportedCluster}/${Environment}-${ServiceName}-service"
|
||||
- ImportedCluster: !ImportValue
|
||||
'Fn::Sub': "${Environment}-onyx-cluster-ECSClusterName"
|
||||
ServiceName: !Ref ServiceName
|
||||
Environment: !Ref Environment
|
||||
ScalableDimension: ecs:service:DesiredCount
|
||||
ServiceNamespace: ecs
|
||||
|
||||
ECSAutoScalingPolicy:
|
||||
Type: AWS::ApplicationAutoScaling::ScalingPolicy
|
||||
Properties:
|
||||
PolicyName: !Sub ${Environment}-${ServiceName}-service-cpu-scaleout
|
||||
ScalingTargetId: !Ref ECSAutoScalingTarget
|
||||
PolicyType: TargetTrackingScaling
|
||||
TargetTrackingScalingPolicyConfiguration:
|
||||
TargetValue: 75
|
||||
PredefinedMetricSpecification:
|
||||
PredefinedMetricType: ECSServiceAverageCPUUtilization
|
||||
ScaleOutCooldown: 60
|
||||
ScaleInCooldown: 60
|
||||
|
||||
ECSAutoScalingPolicyMemory:
|
||||
Type: AWS::ApplicationAutoScaling::ScalingPolicy
|
||||
Properties:
|
||||
PolicyName: !Sub ${Environment}-${ServiceName}-service-memory-scaleout
|
||||
ScalingTargetId: !Ref ECSAutoScalingTarget
|
||||
PolicyType: TargetTrackingScaling
|
||||
TargetTrackingScalingPolicyConfiguration:
|
||||
TargetValue: 80
|
||||
PredefinedMetricSpecification:
|
||||
PredefinedMetricType: ECSServiceAverageMemoryUtilization
|
||||
ScaleOutCooldown: 60
|
||||
ScaleInCooldown: 60
|
||||
@@ -1,76 +0,0 @@
#!/bin/bash

AWS_REGION="${AWS_REGION:-us-west-1}"

# Reference to consolidated config
CONFIG_FILE="onyx_config.json"

# Get environment from config file
ENVIRONMENT=$(jq -r '.Environment' "$CONFIG_FILE")
if [ -z "$ENVIRONMENT" ] || [ "$ENVIRONMENT" == "null" ]; then
  echo "Missing Environment in $CONFIG_FILE. Please add the Environment field."
  exit 1
fi

# Try to get S3_BUCKET from config, fallback to default if not found
S3_BUCKET_FROM_CONFIG=$(jq -r '.S3Bucket // empty' "$CONFIG_FILE")
if [ -n "$S3_BUCKET_FROM_CONFIG" ]; then
  S3_BUCKET="$S3_BUCKET_FROM_CONFIG"
else
  S3_BUCKET="${S3_BUCKET:-onyx-ecs-fargate-configs}"
fi

STACK_NAMES=(
  "${ENVIRONMENT}-onyx-nginx-service"
  "${ENVIRONMENT}-onyx-web-server-service"
  "${ENVIRONMENT}-onyx-backend-background-server-service"
  "${ENVIRONMENT}-onyx-backend-api-server-service"
  "${ENVIRONMENT}-onyx-model-server-inference-service"
  "${ENVIRONMENT}-onyx-model-server-indexing-service"
  "${ENVIRONMENT}-onyx-vespaengine-service"
  "${ENVIRONMENT}-onyx-redis-service"
  "${ENVIRONMENT}-onyx-postgres-service"
  "${ENVIRONMENT}-onyx-cluster"
  "${ENVIRONMENT}-onyx-acm"
  "${ENVIRONMENT}-onyx-efs"
)

delete_stack() {
  local stack_name=$1

  if [ "$stack_name" == "${ENVIRONMENT}-onyx-cluster" ]; then
    echo "Removing all objects and directories from the onyx config s3 bucket."
    aws s3 rm "s3://${ENVIRONMENT}-${S3_BUCKET}" --recursive
    sleep 5
  fi

  echo "Checking if stack $stack_name exists..."
  if aws cloudformation describe-stacks --stack-name "$stack_name" --region "$AWS_REGION" > /dev/null 2>&1; then
    echo "Deleting stack: $stack_name..."
    aws cloudformation delete-stack \
      --stack-name "$stack_name" \
      --region "$AWS_REGION"

    echo "Waiting for stack $stack_name to be deleted..."
    aws cloudformation wait stack-delete-complete \
      --stack-name "$stack_name" \
      --region "$AWS_REGION"

    if [ $? -eq 0 ]; then
      echo "Stack $stack_name deleted successfully."
      sleep 10
    else
      echo "Failed to delete stack $stack_name. Exiting."
      exit 1
    fi
  else
    echo "Stack $stack_name does not exist, skipping."
    return 0
  fi
}

for stack_name in "${STACK_NAMES[@]}"; do
  delete_stack "$stack_name"
done

echo "All stacks deleted successfully."
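A minimal usage sketch for the teardown script deleted above. The filename `delete_stacks.sh` is an assumption (the diff does not show it), and `onyx_config.json` must sit next to the script with at least the `Environment` key; `jq` and an authenticated AWS CLI are required.

```bash
# hypothetical config file; only Environment is mandatory per the script
cat > onyx_config.json <<'EOF'
{ "Environment": "production", "S3Bucket": "onyx-ecs-fargate-configs" }
EOF

# delete every Onyx stack for that environment, in dependency order
AWS_REGION=us-west-1 bash delete_stacks.sh
```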
@@ -31,11 +31,11 @@ upstream api_server {

    # for a TCP configuration
    # TODO: use gunicorn to manage multiple processes
    server ${ONYX_BACKEND_API_HOST}:8080 fail_timeout=0;
    server api_server:8080 fail_timeout=0;
}

upstream web_server {
    server ${ONYX_WEB_SERVER_HOST}:3000 fail_timeout=0;
    server web_server:3000 fail_timeout=0;
}

server {

@@ -1,8 +1,8 @@
# Log format to include request latency
# Override log format to include request latency
log_format custom_main '$remote_addr - $remote_user [$time_local] "$request" '
                       '$status $body_bytes_sent "$http_referer" '
                       '"$http_user_agent" "$http_x_forwarded_for" '
                       'rt=$request_time';
                       '$status $body_bytes_sent "$http_referer" '
                       '"$http_user_agent" "$http_x_forwarded_for" '
                       'rt=$request_time';

upstream api_server {
    # fail_timeout=0 means we always retry an upstream even if it failed
@@ -13,17 +13,17 @@ upstream api_server {

    # for a TCP configuration
    # TODO: use gunicorn to manage multiple processes
    server ${ONYX_BACKEND_API_HOST}:8080 fail_timeout=0;
    server api_server:8080 fail_timeout=0;
}

upstream web_server {
    server ${ONYX_WEB_SERVER_HOST}:3000 fail_timeout=0;
    server web_server:3000 fail_timeout=0;
}

server {
    listen 80 default_server;

    client_max_body_size 5G; # Maximum upload size
    client_max_body_size 5G; # Maximum upload size

    access_log /var/log/nginx/access.log custom_main;

@@ -66,5 +66,5 @@ server {

    proxy_redirect off;
    proxy_pass http://web_server;
}

}
@@ -1,8 +1,5 @@
# fill in the template
ONYX_BACKEND_API_HOST="${ONYX_BACKEND_API_HOST:-api_server}"
ONYX_WEB_SERVER_HOST="${ONYX_WEB_SERVER_HOST:-web_server}"

envsubst '$DOMAIN $SSL_CERT_FILE_NAME $SSL_CERT_KEY_FILE_NAME $ONYX_BACKEND_API_HOST $ONYX_WEB_SERVER_HOST' < "/etc/nginx/conf.d/$1" > /etc/nginx/conf.d/app.conf
envsubst '$DOMAIN $SSL_CERT_FILE_NAME $SSL_CERT_KEY_FILE_NAME' < "/etc/nginx/conf.d/$1" > /etc/nginx/conf.d/app.conf

# wait for the api_server to be ready
echo "Waiting for API server to boot up; this may take a minute or two..."
@@ -13,7 +10,7 @@ echo

while true; do
  # Use curl to send a request and capture the HTTP status code
  status_code=$(curl -o /dev/null -s -w "%{http_code}\n" "http://${ONYX_BACKEND_API_HOST}:8080/health")
  status_code=$(curl -o /dev/null -s -w "%{http_code}\n" "http://api_server:8080/health")

  # Check if the status code is 200
  if [ "$status_code" -eq 200 ]; then
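To make the template substitution above concrete, here is a hedged standalone sketch of what the removed run-nginx.sh line did; paths are shortened for illustration and `envsubst` comes from GNU gettext.

```bash
# values that the ECS task definitions above injected as environment variables
export DOMAIN=demo.danswer.ai
export ONYX_BACKEND_API_HOST=api_server
export ONYX_WEB_SERVER_HOST=web_server

# replace only the listed variables in the nginx template, leaving other $vars untouched
envsubst '$DOMAIN $ONYX_BACKEND_API_HOST $ONYX_WEB_SERVER_HOST' \
  < app.conf.template.dev > app.conf
```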
@@ -1 +0,0 @@
playwright==1.42.0
@@ -1,161 +0,0 @@
#!/usr/bin/env python3
import argparse
import asyncio

from playwright.async_api import async_playwright


async def scrape_twitter_links(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False
        )  # Use non-headless for better scrolling
        page = await browser.new_page(viewport={"width": 1280, "height": 800})

        print(f"Navigating to main page: {url}")
        await page.goto(url)
        await page.wait_for_load_state("networkidle")

        # More aggressive scrolling to load all company cards
        company_links = set()  # Use a set for automatic deduplication
        no_new_links_count = 0

        print("Starting to scroll and collect company links...")

        # First, try scrolling to the very bottom
        await scroll_to_bottom(page)

        # Then collect all links
        prev_size = 0
        while True:
            # Get all company links
            elements = await page.query_selector_all('a[href^="/companies/"]')

            for element in elements:
                href = await element.get_attribute("href")
                if href and "/companies/" in href and "?" not in href:
                    company_url = f"https://www.ycombinator.com{href}"
                    company_links.add(company_url)

            current_size = len(company_links)
            print(f"Found {current_size} unique company links so far...")

            if current_size == prev_size:
                no_new_links_count += 1
                if no_new_links_count >= 3:
                    print("No new links found after multiple attempts, ending scroll.")
                    break
            else:
                no_new_links_count = 0

            prev_size = current_size

            # Try to click "Load More" button if it exists
            try:
                load_more = await page.query_selector('button:has-text("Load More")')
                if load_more:
                    await load_more.click()
                    print("Clicked 'Load More' button")
                    await page.wait_for_timeout(3000)
                    await scroll_to_bottom(page)
                    continue
            except Exception as e:
                print(f"Error clicking Load More: {str(e)}")

            # Scroll more
            try:
                await scroll_to_bottom(page)
            except Exception as e:
                print(f"Error scrolling: {str(e)}")
                break

        print(f"Found {len(company_links)} total unique company links after scrolling")

        # Visit each company page and extract Twitter links
        twitter_data = []

        for i, company_url in enumerate(sorted(company_links)):
            print(f"Processing company {i+1}/{len(company_links)}: {company_url}")
            try:
                await page.goto(company_url)
                await page.wait_for_load_state("networkidle")

                # Extract company name from URL
                company_name = company_url.split("/")[-1]

                # Find all links on the page
                all_links = await page.query_selector_all("a")
                twitter_links = []

                for link in all_links:
                    href = await link.get_attribute("href")
                    if href and ("twitter.com" in href or "x.com" in href):
                        twitter_links.append(href)

                if twitter_links:
                    for twitter_link in twitter_links:
                        twitter_data.append(f"{company_name}: {twitter_link}")
                else:
                    twitter_data.append(f"{company_name}: No Twitter/X link found")

            except Exception as e:
                print(f"Error processing {company_url}: {str(e)}")

        await browser.close()
        return twitter_data


async def scroll_to_bottom(page):
    """Aggressively scroll to the bottom of the page."""
    print("Scrolling to bottom...")

    # Get the current height of the page
    await page.evaluate("document.body.scrollHeight")

    # while True:
    # Scroll to bottom
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(2000)  # Wait for content to load

    # Check if we've reached the bottom
    await page.evaluate("document.body.scrollHeight")
    # if current_height == prev_height:
    #     break

    # Additional scrolls for extra measure
    for _ in range(3):
        await page.keyboard.press("End")
        await page.wait_for_timeout(500)


async def main():
    parser = argparse.ArgumentParser(
        description="Scrape Twitter links from YC company pages"
    )
    parser.add_argument(
        "--url",
        default="https://www.ycombinator.com/companies?batch=W23&batch=S23&batch=S24&batch=F24&batch=S22&batch=W22&query=San%20Francisco",
        help="URL to scrape (default: YC companies from recent batches)",
    )
    parser.add_argument(
        "--output",
        default="twitter_links.txt",
        help="Output file name (default: twitter_links.txt)",
    )
    parser.add_argument(
        "--headless", action="store_true", help="Run in headless mode (default: False)"
    )

    args = parser.parse_args()

    twitter_links = await scrape_twitter_links(args.url)

    # Save to file
    with open(args.output, "w") as f:
        f.write("\n".join(twitter_links))

    print(f"Saved {len(twitter_links)} results to {args.output}")


if __name__ == "__main__":
    asyncio.run(main())
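A usage sketch for the scraper removed above, assuming it was saved as `scrape_yc_twitter.py` (the original filename is not visible in this diff) and that `playwright==1.42.0` plus a Chromium build are installed. Note that the `--headless` flag is parsed but never passed to `p.chromium.launch`, so the browser always opens a visible window.

```bash
pip install playwright==1.42.0
playwright install chromium

python scrape_yc_twitter.py \
  --url "https://www.ycombinator.com/companies?batch=W24&query=San%20Francisco" \
  --output twitter_links.txt
```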
twitter_links.txt (1419 lines): file diff suppressed because it is too large.
@@ -3018,11 +3018,7 @@ export function ChatPage({
                  currentAlternativeAssistant
                }
                messageId={message.messageId}
                content={
                  userFiles
                    ? message.message
                    : "message.message"
                }
                content={message.message}
                files={message.files}
                query={
                  messageHistory[i]?.query || undefined
@@ -508,11 +508,7 @@ export const AIMessage = ({
            userKnowledgeFiles={userKnowledgeFiles}
          />
        )}
        {userKnowledgeFiles ? (
          <div className="h-10 w-10 rounded-full bg-black" />
        ) : (
          <div className="h-10 w-10 rounded-full bg-red-400" />
        )}

        {!userKnowledgeFiles &&
          toolCall &&
          !TOOLS_WITH_CUSTOM_HANDLING.includes(
@@ -43,13 +43,13 @@ const DropdownOption: React.FC<DropdownOptionProps> = ({

  if (href) {
    return (
      <Link
      <a
        href={href}
        target={openInNewTab ? "_blank" : undefined}
        rel={openInNewTab ? "noopener noreferrer" : undefined}
      >
        {content}
      </Link>
      </a>
    );
  } else {
    return <div onClick={onClick}>{content}</div>;
@@ -377,10 +377,7 @@ export function listSourceMetadata(): SourceMetadata[] {
     display in the Add Connector page */
  const entries = Object.entries(SOURCE_METADATA_MAP)
    .filter(
      ([source, _]) =>
        source !== "not_applicable" &&
        source !== "ingestion_api" &&
        source !== "mock_connector"
      ([source, _]) => source !== "not_applicable" && source != "ingestion_api"
    )
    .map(([source, metadata]) => {
      return fillSourceMetadata(metadata, source as ValidSources);