checkpoint

2026-03-15 20:52:39 +00:00 · 2024-12-27 17:54:21 -08:00 · 2024-12-24 10:32:03 -08:00
4 changed files with 253 additions and 1 deletions
--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -105,7 +105,7 @@ TRACK_EXTERNAL_IDP_EXPIRY = (


 #####
-# DB Configs
+# DB/Index/Task Configs
 #####
 DOCUMENT_INDEX_NAME = "danswer_index"
 # Vespa is now the default document index store for both keyword and vector
@@ -130,6 +130,14 @@ VESPA_DEPLOYMENT_ZIP = (
 VESPA_CLOUD_CERT_PATH = os.environ.get("VESPA_CLOUD_CERT_PATH")
 VESPA_CLOUD_KEY_PATH = os.environ.get("VESPA_CLOUD_KEY_PATH")

+
+# OpenSearch connection settings. Each value is read from the environment and
+# falls back to the literal after `or` when the variable is unset or empty.
+OPENSEARCH_HOST = os.environ.get("OPENSEARCH_HOST") or "localhost"
+# The port is kept as a string here
+OPENSEARCH_PORT = os.environ.get("OPENSEARCH_PORT") or "9200"
+OPENSEARCH_USER = os.environ.get("OPENSEARCH_USER") or "admin"
+# NOTE(review): the fallback password is hard-coded in source - presumably meant for
+# local development only; confirm it is always overridden in real deployments
+OPENSEARCH_PASSWORD = os.environ.get("OPENSEARCH_PASSWORD") or "0n^x_1ndex"
+
+
 # Number of documents in a batch during indexing (further batching done by chunks before passing to bi-encoder)
 try:
    INDEX_BATCH_SIZE = int(os.environ.get("INDEX_BATCH_SIZE", 16))
--- a/backend/onyx/document_index/opensearch/constants.py
+++ b/backend/onyx/document_index/opensearch/constants.py
@@ -0,0 +1,34 @@
+# TUNABLE PARAMETERS, TODO evaluate these
+# These should get ported to somewhere else probably eventually
+# Default number of shards for the index settings
+SHARDS = 1
+
+# Default HNSW parameters ("ef_construction" and "m") for the knn_vector fields
+EF_CONSTRUCTION = 200
+M = 50
+
+# Document field names; most of these are used as keys in the index mapping
+DOCUMENT_ID_FIELD = "document_id"
+CHUNK_ID_FIELD = "chunk_id"
+LARGE_CHUNK_START_ID_FIELD = "large_chunk_start_id"
+LARGE_CHUNK_END_ID_FIELD = "large_chunk_end_id"
+TITLE_FIELD = "title"
+CONTENT_FIELD = "content"
+TITLE_EMBEDDING_FIELD = "title_embedding"
+CHUNK_EMBEDDING_FIELD = "chunk_embeddings"
+# Subfield of the nested chunk embeddings field that holds the vector
+VECTOR_SUBFIELD = "vector"
+HIDDEN_FIELD = "hidden"
+SOURCE_TYPE_FIELD = "source_type"
+DOCUMENT_SETS_FIELD = "document_sets"
+METADATA_FIELD = "metadata"
+# Subfields of the nested metadata field
+KEY_SUBFIELD = "key"
+VALUE_SUBFIELD = "value"
+BOOST_COUNT_FIELD = "boost_count"
+DOC_UPDATED_AT_FIELD = "doc_updated_at"
+ACCESS_CONTROL_LIST_FIELD = "access_control_list"
+CHUNKS_ABOVE_FIELD = "chunks_above"
+CHUNKS_BELOW_FIELD = "chunks_below"
+SEMANTIC_IDENTIFIER_FIELD = "semantic_identifier"
+LINK_FIELD = "link"
+METADATA_SUFFIX_FIELD = "metadata_suffix"
+PRIMARY_OWNERS_FIELD = "primary_owners"
+SECONDARY_OWNERS_FIELD = "secondary_owners"
+
+# NOTE(review): presumably the id under which the normalization search pipeline
+# is registered - confirm against the code that uses it
+NORMALIZATION_PROCESSOR_ID = "normalization_step"
--- a/backend/onyx/document_index/opensearch/utils.py
+++ b/backend/onyx/document_index/opensearch/utils.py
@@ -0,0 +1,209 @@
+from typing import Any
+
+from opensearchpy import OpenSearch
+
+from onyx.configs.app_configs import OPENSEARCH_HOST
+from onyx.configs.app_configs import OPENSEARCH_PASSWORD
+from onyx.configs.app_configs import OPENSEARCH_PORT
+from onyx.configs.app_configs import OPENSEARCH_USER
+from onyx.document_index.opensearch.constants import ACCESS_CONTROL_LIST_FIELD
+from onyx.document_index.opensearch.constants import CHUNK_EMBEDDING_FIELD
+from onyx.document_index.opensearch.constants import CHUNK_ID_FIELD
+from onyx.document_index.opensearch.constants import CHUNKS_ABOVE_FIELD
+from onyx.document_index.opensearch.constants import CHUNKS_BELOW_FIELD
+from onyx.document_index.opensearch.constants import CONTENT_FIELD
+from onyx.document_index.opensearch.constants import DOC_UPDATED_AT_FIELD
+from onyx.document_index.opensearch.constants import DOCUMENT_ID_FIELD
+from onyx.document_index.opensearch.constants import DOCUMENT_SETS_FIELD
+from onyx.document_index.opensearch.constants import EF_CONSTRUCTION
+from onyx.document_index.opensearch.constants import HIDDEN_FIELD
+from onyx.document_index.opensearch.constants import KEY_SUBFIELD
+from onyx.document_index.opensearch.constants import LARGE_CHUNK_END_ID_FIELD
+from onyx.document_index.opensearch.constants import LARGE_CHUNK_START_ID_FIELD
+from onyx.document_index.opensearch.constants import LINK_FIELD
+from onyx.document_index.opensearch.constants import M
+from onyx.document_index.opensearch.constants import METADATA_FIELD
+from onyx.document_index.opensearch.constants import METADATA_SUFFIX_FIELD
+from onyx.document_index.opensearch.constants import PRIMARY_OWNERS_FIELD
+from onyx.document_index.opensearch.constants import SECONDARY_OWNERS_FIELD
+from onyx.document_index.opensearch.constants import SEMANTIC_IDENTIFIER_FIELD
+from onyx.document_index.opensearch.constants import SHARDS
+from onyx.document_index.opensearch.constants import SOURCE_TYPE_FIELD
+from onyx.document_index.opensearch.constants import TITLE_EMBEDDING_FIELD
+from onyx.document_index.opensearch.constants import TITLE_FIELD
+from onyx.document_index.opensearch.constants import VALUE_SUBFIELD
+from onyx.document_index.opensearch.constants import VECTOR_SUBFIELD
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def create_opensearch_client(
+    host: str = OPENSEARCH_HOST,
+    port: str = OPENSEARCH_PORT,
+    user: str = OPENSEARCH_USER,
+    password: str = OPENSEARCH_PASSWORD,
+) -> OpenSearch:
+    """Build an OpenSearch client for a single node using HTTP basic auth.
+
+    SSL is enabled, but certificate verification is turned off and the
+    related SSL warning is suppressed.
+    """
+    node = {"host": host, "port": port}
+    return OpenSearch(
+        hosts=[node],
+        http_auth=(user, password),
+        use_ssl=True,
+        verify_certs=False,
+        ssl_show_warn=False,
+    )
+
+
+#####
+# Schema Utils
+#####
+def get_schema_settings(shards: int = SHARDS) -> dict[str, Any]:
+    """Return the index-level settings: the shard count, with k-NN enabled."""
+    index_settings: dict[str, Any] = {"number_of_shards": shards, "knn": True}
+    return {"index": index_settings}
+
+
+def get_hnsw_config(
+    embedding_dim: int, ef_construction: int = EF_CONSTRUCTION, m: int = M
+) -> dict[str, Any]:
+    """Return the mapping for a knn_vector field of the given dimension.
+
+    The field uses the HNSW method on the lucene engine with cosine
+    similarity; `ef_construction` and `m` are passed through as the method
+    parameters.
+    """
+    method: dict[str, Any] = {
+        "name": "hnsw",
+        "space_type": "cosinesimil",
+        "engine": "lucene",
+        "parameters": {"ef_construction": ef_construction, "m": m},
+    }
+    return {"type": "knn_vector", "dimension": embedding_dim, "method": method}
+
+
+def get_danswer_opensearch_schema(embedding_dim: int) -> dict[str, Any]:
+    """Return the full index body (settings plus mappings) for the chunk index.
+
+    Both the title embedding and the nested chunk embeddings use the HNSW
+    configuration for `embedding_dim`.
+    """
+
+    def unindexed(field_type: str) -> dict[str, Any]:
+        # A fresh dict per field; these are not indexed, for use post-retrieval
+        return {"type": field_type, "index": False, "doc_values": False}
+
+    settings = get_schema_settings()
+
+    # Identification Fields
+    properties: dict[str, Any] = {
+        DOCUMENT_ID_FIELD: {"type": "text"},
+        CHUNK_ID_FIELD: {"type": "integer"},
+        LARGE_CHUNK_START_ID_FIELD: {"type": "integer"},
+        LARGE_CHUNK_END_ID_FIELD: {"type": "integer"},
+    }
+
+    # Search Fields
+    properties[TITLE_FIELD] = {"type": "text"}
+    properties[CONTENT_FIELD] = {"type": "text"}
+    properties[METADATA_SUFFIX_FIELD] = {"type": "text"}
+    properties[TITLE_EMBEDDING_FIELD] = get_hnsw_config(embedding_dim=embedding_dim)
+    # Only will have multiple for mini chunks, otherwise it will be a list of 1
+    properties[CHUNK_EMBEDDING_FIELD] = {
+        "type": "nested",
+        "properties": {VECTOR_SUBFIELD: get_hnsw_config(embedding_dim=embedding_dim)},
+    }
+
+    # Filter Fields
+    properties[HIDDEN_FIELD] = {"type": "boolean", "null_value": False}
+    properties[SOURCE_TYPE_FIELD] = {"type": "keyword"}
+    properties[DOCUMENT_SETS_FIELD] = {"type": "keyword"}
+    properties[METADATA_FIELD] = {
+        "type": "nested",
+        "properties": {
+            KEY_SUBFIELD: {"type": "keyword"},
+            VALUE_SUBFIELD: {"type": "keyword"},
+        },
+    }
+    properties[DOC_UPDATED_AT_FIELD] = {"type": "date"}
+
+    # ACL
+    properties[ACCESS_CONTROL_LIST_FIELD] = {"type": "keyword"}
+
+    # Not indexed, for use post-retrieval
+    # chunks above/below are stored as extra info on disk so that we can retrieve them for
+    # context without running a second query to fetch the context around the current chunk
+    # TODO include these actually
+    properties[CHUNKS_ABOVE_FIELD] = unindexed("text")
+    properties[CHUNKS_BELOW_FIELD] = unindexed("text")
+    properties[SEMANTIC_IDENTIFIER_FIELD] = unindexed("text")
+    properties[LINK_FIELD] = unindexed("text")
+    # All fields are array fields by default
+    properties[PRIMARY_OWNERS_FIELD] = unindexed("keyword")
+    properties[SECONDARY_OWNERS_FIELD] = unindexed("keyword")
+
+    return {"settings": settings, "mappings": {"properties": properties}}
+
+
+def create_index(index_name: str, embedding_dim: int) -> None:
+    """Create the index `index_name` with the schema for `embedding_dim`.
+
+    A new client with the default connection settings is built for the call.
+    """
+    logger.info(f"Creating index {index_name} with embedding dimension {embedding_dim}")
+    client = create_opensearch_client()
+    schema = get_danswer_opensearch_schema(embedding_dim=embedding_dim)
+    client.indices.create(index=index_name, body=schema)
+
+
+#####
+# Query Utils
+#####
+def get_normalization_search_pipeline_settings(
+    keyword_weighting: float = 0.4,
+    title_vector_boost_weighting: float = 0.1,
+    chunk_vector_weighting: float = 0.5,
+) -> dict[str, Any]:
+    """Return the search pipeline body that normalizes and combines scores.
+
+    Scores are min-max normalized and combined with a weighted arithmetic
+    mean. The weights are listed in the order keyword, title vector, chunk
+    vector.
+    """
+    # TODO: Explore hyperparameters
+    # Note: The expectation is that the Keyword component encompasses both the Title and the
+    # Chunk texts, and that the Title field is already upweighted by the time it hits this step.
+    # The title is also expected to be included in the chunk text for the vectorizing, so the
+    # extra title boost is ADDITIONAL
+    weights = [
+        keyword_weighting,
+        title_vector_boost_weighting,
+        chunk_vector_weighting,
+    ]
+    normalization_processor = {
+        "normalization": {"technique": "min_max"},
+        "combination": {
+            "technique": "arithmetic_mean",
+            "parameters": {"weights": weights},
+        },
+    }
+    return {
+        "description": "Normalization for keyword and vector scores",
+        "phase_results_processors": [
+            {"normalization-processor": normalization_processor}
+        ],
+    }
+
+
+def get_query_base(max_num_results: int) -> dict[str, Any]:
+    """Return an empty bool query skeleton capped at `max_num_results` hits.
+
+    The "must" and "filter" lists start empty, and each call returns new
+    list objects.
+    """
+    bool_clauses: dict[str, Any] = {"must": [], "filter": []}
+    return {"size": max_num_results, "query": {"bool": bool_clauses}}
+
+
+def get_not_hidden_filter() -> dict[str, Any]:
+    """Return a term filter that keeps only chunks not marked as hidden.
+
+    The index mapping has no "not_hidden" field, so a filter on that name
+    could not line up with the schema. Visibility is stored in the boolean
+    HIDDEN_FIELD, whose mapping sets null_value to False, so the filter
+    matches documents where that field is False.
+    """
+    return {"term": {HIDDEN_FIELD: False}}
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -40,6 +40,7 @@ Office365-REST-Python-Client==2.5.9
 oauthlib==3.2.2
 openai==1.55.3
 openpyxl==3.1.2
+opensearch-py==2.8.0
 playwright==1.41.2
 psutil==5.9.5
 psycopg2-binary==2.9.9
Author	SHA1	Message	Date
Yuhong Sun	31fc7bc1a9	checkpoint	2024-12-27 17:54:21 -08:00
Yuhong Sun	5bab9c1d6d	checkpoint	2024-12-24 10:32:03 -08:00