mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-02-26 12:15:48 +00:00
Compare commits
45 Commits
v0.4.19
...
benchmarki
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
36c30a7d94 | ||
|
|
4ba37c255a | ||
|
|
e8eb89f6ed | ||
|
|
aacee9b21a | ||
|
|
4cc0697b66 | ||
|
|
d5d7772524 | ||
|
|
4326866a9d | ||
|
|
609c387e9a | ||
|
|
b66a0df069 | ||
|
|
46ff75e675 | ||
|
|
279a3a02f8 | ||
|
|
1954164ec4 | ||
|
|
c6f08cd1cd | ||
|
|
d9ada5d290 | ||
|
|
0a1fdb80db | ||
|
|
e94d753821 | ||
|
|
8e81b935d1 | ||
|
|
17866dcf8b | ||
|
|
b427fac6d7 | ||
|
|
9652af82fe | ||
|
|
340f703ff5 | ||
|
|
8de8c00531 | ||
|
|
d82bd97dd4 | ||
|
|
39341516ab | ||
|
|
54f5248eb1 | ||
|
|
97412c9b7a | ||
|
|
2f770264c8 | ||
|
|
8777749646 | ||
|
|
22d7427e45 | ||
|
|
f4e80c9f22 | ||
|
|
cb06cf7481 | ||
|
|
0635a81d5e | ||
|
|
437c5856d0 | ||
|
|
1a58ad4276 | ||
|
|
a9e0967771 | ||
|
|
da81e843d2 | ||
|
|
34980974ac | ||
|
|
61ea59affc | ||
|
|
83d5d49323 | ||
|
|
ca397c3121 | ||
|
|
d2f3e0165a | ||
|
|
90398f5e56 | ||
|
|
7c2c5563e3 | ||
|
|
99df68f5ac | ||
|
|
ebec047aa5 |
@@ -31,6 +31,8 @@ schema danswer_chunk {
|
||||
# https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
|
||||
field source_type type string {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}
|
||||
# Can also index links https://docs.vespa.ai/en/reference/schema-reference.html#attribute
|
||||
# URL type matching
|
||||
@@ -61,6 +63,8 @@ schema danswer_chunk {
|
||||
}
|
||||
field hidden type bool {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}
|
||||
field metadata type string {
|
||||
indexing: summary | attribute
|
||||
@@ -82,10 +86,12 @@ schema danswer_chunk {
|
||||
}
|
||||
field access_control_list type weightedset<string> {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}
|
||||
field document_sets type weightedset<string> {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,17 @@
|
||||
<disk>0.75</disk>
|
||||
</resource-limits>
|
||||
</tuning>
|
||||
<engine>
|
||||
<proton>
|
||||
<tuning>
|
||||
<searchnode>
|
||||
<requestthreads>
|
||||
<persearch>16</persearch>
|
||||
</requestthreads>
|
||||
</searchnode>
|
||||
</tuning>
|
||||
</proton>
|
||||
</engine>
|
||||
<config name="vespa.config.search.summary.juniperrc">
|
||||
<max_matches>3</max_matches>
|
||||
<length>750</length>
|
||||
|
||||
@@ -58,6 +58,7 @@ from danswer.search.search_runner import query_processing
|
||||
from danswer.search.search_runner import remove_stop_words
|
||||
from danswer.utils.batching import batch_generator
|
||||
from danswer.utils.logger import setup_logger
|
||||
from danswer.utils.timing import log_function_time
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -298,6 +299,8 @@ def _index_vespa_chunks(
|
||||
|
||||
|
||||
def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) -> str:
|
||||
return ""
|
||||
|
||||
def _build_or_filters(key: str, vals: list[str] | None) -> str:
|
||||
if vals is None:
|
||||
return ""
|
||||
@@ -412,7 +415,7 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
|
||||
|
||||
return InferenceChunk(
|
||||
chunk_id=fields[CHUNK_ID],
|
||||
blurb=fields[BLURB],
|
||||
blurb=fields.get(BLURB, ""),
|
||||
content=fields[CONTENT],
|
||||
source_links=source_links_dict,
|
||||
section_continuation=fields[SECTION_CONTINUATION],
|
||||
@@ -429,13 +432,27 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
|
||||
)
|
||||
|
||||
|
||||
@log_function_time()
|
||||
def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
|
||||
if "query" in query_params and not cast(str, query_params["query"]).strip():
|
||||
raise ValueError("No/empty query received")
|
||||
response = requests.get(SEARCH_ENDPOINT, params=query_params)
|
||||
|
||||
logger.info("Making query with params: %s", query_params)
|
||||
response = requests.get(
|
||||
SEARCH_ENDPOINT,
|
||||
params=dict(
|
||||
**query_params,
|
||||
**{
|
||||
"presentation.timing": True,
|
||||
},
|
||||
),
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
hits = response.json()["root"].get("children", [])
|
||||
response_json = response.json()
|
||||
logger.debug("Response: %s", response_json)
|
||||
logger.info("timing info: %s", response_json.get("timing"))
|
||||
hits = response_json["root"].get("children", [])
|
||||
|
||||
for hit in hits:
|
||||
if hit["fields"].get(CONTENT) is None:
|
||||
@@ -447,7 +464,6 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
|
||||
)
|
||||
|
||||
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
|
||||
|
||||
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
|
||||
return inference_chunks
|
||||
|
||||
@@ -467,7 +483,7 @@ class VespaIndex(DocumentIndex):
|
||||
f"{BOOST}, "
|
||||
f"{HIDDEN}, "
|
||||
f"{DOC_UPDATED_AT}, "
|
||||
f"{METADATA}, "
|
||||
f"{METADATA} "
|
||||
f"{CONTENT_SUMMARY} "
|
||||
f"from {DOCUMENT_INDEX_NAME} where "
|
||||
)
|
||||
@@ -617,6 +633,7 @@ class VespaIndex(DocumentIndex):
|
||||
"hits": num_to_retrieve,
|
||||
"offset": 0,
|
||||
"ranking.profile": "keyword_search",
|
||||
"timeout": "10s",
|
||||
}
|
||||
|
||||
return _query_vespa(params)
|
||||
@@ -656,6 +673,7 @@ class VespaIndex(DocumentIndex):
|
||||
"hits": num_to_retrieve,
|
||||
"offset": 0,
|
||||
"ranking.profile": "semantic_search",
|
||||
"timeout": "10s",
|
||||
}
|
||||
|
||||
return _query_vespa(params)
|
||||
@@ -695,6 +713,7 @@ class VespaIndex(DocumentIndex):
|
||||
"hits": num_to_retrieve,
|
||||
"offset": 0,
|
||||
"ranking.profile": "hybrid_search",
|
||||
"timeout": "10s",
|
||||
}
|
||||
|
||||
return _query_vespa(params)
|
||||
|
||||
@@ -61,6 +61,7 @@ def query_processing(
|
||||
return query
|
||||
|
||||
|
||||
@log_function_time()
|
||||
def embed_query(
|
||||
query: str,
|
||||
embedding_model: SentenceTransformer | None = None,
|
||||
@@ -362,6 +363,7 @@ def danswer_search(
|
||||
search_type=question.search_type,
|
||||
filters=final_filters,
|
||||
favor_recent=True if question.favor_recent is None else question.favor_recent,
|
||||
skip_rerank=question.skip_rerank,
|
||||
)
|
||||
|
||||
ranked_chunks, unranked_chunks = search_chunks(
|
||||
|
||||
@@ -198,6 +198,7 @@ class QuestionRequest(BaseModel):
|
||||
enable_auto_detect_filters: bool
|
||||
favor_recent: bool | None = None
|
||||
search_type: SearchType = SearchType.HYBRID
|
||||
skip_rerank: bool = False
|
||||
|
||||
|
||||
class QAFeedbackRequest(BaseModel):
|
||||
|
||||
215
backend/scripts/benchmark_search.py
Normal file
215
backend/scripts/benchmark_search.py
Normal file
@@ -0,0 +1,215 @@
|
||||
import random
|
||||
import time
|
||||
|
||||
import nltk
|
||||
import requests
|
||||
|
||||
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
|
||||
from danswer.search.models import SearchType
|
||||
|
||||
question_bank = [
|
||||
"Who was the first president of the United States?",
|
||||
"What is photosynthesis?",
|
||||
"How long is the Great Wall of China?",
|
||||
"When was the Eiffel Tower constructed?",
|
||||
"Who wrote 'Pride and Prejudice'?",
|
||||
"What's the difference between mitosis and meiosis?",
|
||||
"What is the capital of Brazil?",
|
||||
"Who discovered penicillin?",
|
||||
"What causes the Aurora Borealis?",
|
||||
"When did the Titanic sink?",
|
||||
"How does a combustion engine work?",
|
||||
"Who is the author of 'The Odyssey'?",
|
||||
"What is quantum physics?",
|
||||
"When was the Mona Lisa painted?",
|
||||
"What's the difference between a meteor and a meteorite?",
|
||||
"Who founded the city of Rome?",
|
||||
"What is the boiling point of water at sea level?",
|
||||
"Who won the Nobel Prize in Literature in 1953?",
|
||||
"How do honeybees produce honey?",
|
||||
"What is the deepest part of the ocean?",
|
||||
"When did the first humans arrive in the Americas?",
|
||||
"What is the Fibonacci sequence?",
|
||||
"How was the Grand Canyon formed?",
|
||||
"Who composed the Moonlight Sonata?",
|
||||
"What are the primary colors of light?",
|
||||
"When did the Roman Empire fall?",
|
||||
"How does photosynthesis contribute to the carbon cycle?",
|
||||
"Who was the first woman in space?",
|
||||
"What is the Pythagorean theorem?",
|
||||
"Which planet is known as the 'Red Planet'?",
|
||||
"Who is the father of modern physics?",
|
||||
"What is the primary purpose of the United Nations?",
|
||||
"How old is the Earth?",
|
||||
"Who wrote 'Don Quixote'?",
|
||||
"What is the structure of DNA?",
|
||||
"When was the Declaration of Independence signed?",
|
||||
"What causes a solar eclipse?",
|
||||
"Who was the longest-reigning British monarch?",
|
||||
"How do tornadoes form?",
|
||||
"Who developed the theory of relativity?",
|
||||
"What's the tallest mountain on Earth when measured from base to peak?",
|
||||
"How many bones are there in the adult human body?",
|
||||
"When was the Internet invented?",
|
||||
"Who was the ancient Egyptian queen known for her relationship with Roman leaders?",
|
||||
"What is the Krebs cycle?",
|
||||
"Which country has the largest land area?",
|
||||
"Who painted the Starry Night?",
|
||||
"What's the difference between an alligator and a crocodile?",
|
||||
"Who discovered the circulation of blood?",
|
||||
"How many planets are there in our solar system?",
|
||||
]
|
||||
|
||||
additional_questions = [
|
||||
"Who wrote the play 'Hamlet'?",
|
||||
"What is the speed of light in a vacuum?",
|
||||
"When did World War I begin?",
|
||||
"Who was known as the 'Father of Medicine'?",
|
||||
"What's the largest mammal on Earth?",
|
||||
"Which element has the atomic number 79?",
|
||||
"When did the Renaissance period begin?",
|
||||
"What is the currency used in Japan?",
|
||||
"Who proposed the theory of evolution by natural selection?",
|
||||
"Which planet has a day that lasts longer than its year?",
|
||||
"What is the capital of Australia?",
|
||||
"Who painted the Last Supper?",
|
||||
"How do plants get their green color?",
|
||||
"When was the Magna Carta signed?",
|
||||
"What are the building blocks of proteins?",
|
||||
"Which civilization built Machu Picchu?",
|
||||
"What's the most abundant gas in Earth's atmosphere?",
|
||||
"Who translated the Bible into German during the Reformation?",
|
||||
"What causes the tides in the ocean?",
|
||||
"When did the Olympic Games originate?",
|
||||
"What is a black hole?",
|
||||
"Which river is the longest in the world?",
|
||||
"Who invented the telephone?",
|
||||
"When was the French Revolution?",
|
||||
"What is the smallest prime number?",
|
||||
"Which country is known as the Land of the Rising Sun?",
|
||||
"Who composed the Four Seasons?",
|
||||
"What is the periodic table?",
|
||||
"When was the Great Depression?",
|
||||
"What is the primary function of red blood cells?",
|
||||
"Who is known for his laws of motion?",
|
||||
"Which ancient wonder was located in the city of Babylon?",
|
||||
"What are the base pairs in DNA?",
|
||||
"When was the first airplane flight?",
|
||||
"What's the main ingredient in guacamole?",
|
||||
"Which empire was ruled by Suleiman the Magnificent?",
|
||||
"What is the human body's largest organ?",
|
||||
"Who authored 'Brave New World'?",
|
||||
"How does electricity work?",
|
||||
"When did the Cold War end?",
|
||||
"What's the difference between prokaryotic and eukaryotic cells?",
|
||||
"Which mountain range includes Mount Everest?",
|
||||
"Who is the Greek god of war?",
|
||||
"When was the printing press invented?",
|
||||
"What are antibiotics used for?",
|
||||
"Which desert is the driest on Earth?",
|
||||
"Who was the first African American U.S. Supreme Court Justice?",
|
||||
"How many teeth do adult humans typically have?",
|
||||
"Who is the protagonist in 'The Catcher in the Rye'?",
|
||||
"What is the study of fossils called?",
|
||||
]
|
||||
|
||||
# Download the wordlist
|
||||
nltk.download("words")
|
||||
from nltk.corpus import words # noqa: E402
|
||||
|
||||
|
||||
def generate_random_sentence():
    """Build a nonsense query of 5-10 random words from the NLTK wordlist."""
    vocabulary = words.words()
    num_words = random.randint(5, 10)
    return " ".join(random.choices(vocabulary, k=num_words))
|
||||
|
||||
|
||||
def _measure_search_latency(
    query: str,
    search_type: SearchType,
    skip_rerank: bool = True,
    enable_auto_detect_filters: bool = False,
    filters: dict | None = None,
):
    """POST one query to the local /document-search endpoint and return the
    elapsed wall-clock time in seconds.

    Raises a generic Exception if the API responds with a non-OK status.
    """
    start = time.monotonic()
    payload = {
        "query": query,
        "collection": DOCUMENT_INDEX_NAME,
        "filters": filters or {},
        "enable_auto_detect_filters": enable_auto_detect_filters,
        "search_type": search_type,
        "skip_rerank": skip_rerank,
    }
    response = requests.post("http://localhost:8080/document-search", json=payload)
    if response.ok:
        return time.monotonic() - start
    raise Exception(f"Failed to search: {response.text}")
|
||||
|
||||
|
||||
def _print_latency_stats(label: str, latencies: list[float]) -> None:
    """Print average / P50 / P95 latency for one benchmark configuration."""
    ordered = sorted(latencies)
    count = len(ordered)
    print(f"[{label}] Average latency: {sum(ordered) / count}")
    print(f"[{label}] P50: {ordered[int(count * 0.5)]}")
    print(f"[{label}] P95: {ordered[int(count * 0.95)]}")


if __name__ == "__main__":
    sentences = question_bank + additional_questions
    num_trials = 100

    # Each entry: (report label, kwargs forwarded to _measure_search_latency).
    # Previously this was four near-identical copy-pasted benchmark loops;
    # the configurations below reproduce them in order.
    benchmark_configs: list[tuple[str, dict]] = [
        ("Keyword", {"search_type": SearchType.KEYWORD}),
        ("Hybrid", {"search_type": SearchType.HYBRID}),
        (
            "Hybrid + CE",
            {"search_type": SearchType.HYBRID, "skip_rerank": False},
        ),
        (
            "Hybrid + CE + filters",
            {
                "search_type": SearchType.HYBRID,
                "skip_rerank": False,
                "enable_auto_detect_filters": True,
            },
        ),
    ]

    for label, kwargs in benchmark_configs:
        latencies: list[float] = []
        for i in range(num_trials):
            latencies.append(
                _measure_search_latency(query=sentences[i], **kwargs)
            )
            print("Latency", latencies[-1])
        _print_latency_stats(label, latencies)
|
||||
191
backend/scripts/benchmark_search_isolated.py
Normal file
191
backend/scripts/benchmark_search_isolated.py
Normal file
@@ -0,0 +1,191 @@
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from collections.abc import Mapping
|
||||
|
||||
import nltk
|
||||
import requests
|
||||
|
||||
from danswer.configs.app_configs import DOC_TIME_DECAY
|
||||
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
|
||||
from danswer.document_index.vespa.index import SEARCH_ENDPOINT
|
||||
from danswer.search.search_runner import embed_query
|
||||
|
||||
# Download the wordlist
|
||||
nltk.download("words")
|
||||
from nltk.corpus import words # noqa: E402
|
||||
|
||||
question_bank = [
|
||||
"Who was the first president of the United States?",
|
||||
"What is photosynthesis?",
|
||||
"How long is the Great Wall of China?",
|
||||
"When was the Eiffel Tower constructed?",
|
||||
"Who wrote 'Pride and Prejudice'?",
|
||||
"What's the difference between mitosis and meiosis?",
|
||||
"What is the capital of Brazil?",
|
||||
"Who discovered penicillin?",
|
||||
"What causes the Aurora Borealis?",
|
||||
"When did the Titanic sink?",
|
||||
"How does a combustion engine work?",
|
||||
"Who is the author of 'The Odyssey'?",
|
||||
"What is quantum physics?",
|
||||
"When was the Mona Lisa painted?",
|
||||
"What's the difference between a meteor and a meteorite?",
|
||||
"Who founded the city of Rome?",
|
||||
"What is the boiling point of water at sea level?",
|
||||
"Who won the Nobel Prize in Literature in 1953?",
|
||||
"How do honeybees produce honey?",
|
||||
"What is the deepest part of the ocean?",
|
||||
"When did the first humans arrive in the Americas?",
|
||||
"What is the Fibonacci sequence?",
|
||||
"How was the Grand Canyon formed?",
|
||||
"Who composed the Moonlight Sonata?",
|
||||
"What are the primary colors of light?",
|
||||
"When did the Roman Empire fall?",
|
||||
"How does photosynthesis contribute to the carbon cycle?",
|
||||
"Who was the first woman in space?",
|
||||
"What is the Pythagorean theorem?",
|
||||
"Which planet is known as the 'Red Planet'?",
|
||||
"Who is the father of modern physics?",
|
||||
"What is the primary purpose of the United Nations?",
|
||||
"How old is the Earth?",
|
||||
"Who wrote 'Don Quixote'?",
|
||||
"What is the structure of DNA?",
|
||||
"When was the Declaration of Independence signed?",
|
||||
"What causes a solar eclipse?",
|
||||
"Who was the longest-reigning British monarch?",
|
||||
"How do tornadoes form?",
|
||||
"Who developed the theory of relativity?",
|
||||
"What's the tallest mountain on Earth when measured from base to peak?",
|
||||
"How many bones are there in the adult human body?",
|
||||
"When was the Internet invented?",
|
||||
"Who was the ancient Egyptian queen known for her relationship with Roman leaders?",
|
||||
"What is the Krebs cycle?",
|
||||
"Which country has the largest land area?",
|
||||
"Who painted the Starry Night?",
|
||||
"What's the difference between an alligator and a crocodile?",
|
||||
"Who discovered the circulation of blood?",
|
||||
"How many planets are there in our solar system?",
|
||||
]
|
||||
|
||||
additional_questions = [
|
||||
"Who wrote the play 'Hamlet'?",
|
||||
"What is the speed of light in a vacuum?",
|
||||
"When did World War I begin?",
|
||||
"Who was known as the 'Father of Medicine'?",
|
||||
"What's the largest mammal on Earth?",
|
||||
"Which element has the atomic number 79?",
|
||||
"When did the Renaissance period begin?",
|
||||
"What is the currency used in Japan?",
|
||||
"Who proposed the theory of evolution by natural selection?",
|
||||
"Which planet has a day that lasts longer than its year?",
|
||||
"What is the capital of Australia?",
|
||||
"Who painted the Last Supper?",
|
||||
"How do plants get their green color?",
|
||||
"When was the Magna Carta signed?",
|
||||
"What are the building blocks of proteins?",
|
||||
"Which civilization built Machu Picchu?",
|
||||
"What's the most abundant gas in Earth's atmosphere?",
|
||||
"Who translated the Bible into German during the Reformation?",
|
||||
"What causes the tides in the ocean?",
|
||||
"When did the Olympic Games originate?",
|
||||
"What is a black hole?",
|
||||
"Which river is the longest in the world?",
|
||||
"Who invented the telephone?",
|
||||
"When was the French Revolution?",
|
||||
"What is the smallest prime number?",
|
||||
"Which country is known as the Land of the Rising Sun?",
|
||||
"Who composed the Four Seasons?",
|
||||
"What is the periodic table?",
|
||||
"When was the Great Depression?",
|
||||
"What is the primary function of red blood cells?",
|
||||
"Who is known for his laws of motion?",
|
||||
"Which ancient wonder was located in the city of Babylon?",
|
||||
"What are the base pairs in DNA?",
|
||||
"When was the first airplane flight?",
|
||||
"What's the main ingredient in guacamole?",
|
||||
"Which empire was ruled by Suleiman the Magnificent?",
|
||||
"What is the human body's largest organ?",
|
||||
"Who authored 'Brave New World'?",
|
||||
"How does electricity work?",
|
||||
"When did the Cold War end?",
|
||||
"What's the difference between prokaryotic and eukaryotic cells?",
|
||||
"Which mountain range includes Mount Everest?",
|
||||
"Who is the Greek god of war?",
|
||||
"When was the printing press invented?",
|
||||
"What are antibiotics used for?",
|
||||
"Which desert is the driest on Earth?",
|
||||
"Who was the first African American U.S. Supreme Court Justice?",
|
||||
"How many teeth do adult humans typically have?",
|
||||
"Who is the protagonist in 'The Catcher in the Rye'?",
|
||||
"What is the study of fossils called?",
|
||||
]
|
||||
|
||||
|
||||
def generate_random_sentence():
    """Return a random 5-10 word "sentence" drawn from the NLTK word list."""
    length = random.randint(5, 10)
    picked = random.choices(words.words(), k=length)
    return " ".join(picked)
|
||||
|
||||
|
||||
def _query_vespa(query_params: Mapping[str, str | int]) -> list:
    """Run a raw Vespa query and return the result hits.

    Adds `presentation.timing` so Vespa includes per-phase timing in the
    response, which is printed for benchmarking purposes.
    """
    params = {**query_params, "presentation.timing": True}
    response = requests.get(SEARCH_ENDPOINT, params=params)
    response.raise_for_status()

    body = response.json()
    print("timing info", body.get("timing"))
    return body["root"].get("children", [])
|
||||
|
||||
|
||||
def _measure_vespa_latency(filters: dict | None = None) -> float:
    """Time a single query made directly against Vespa (bypassing the API
    server) using a randomly generated sentence, returning seconds elapsed.

    The measured span covers the HTTP round trip plus iterating the returned
    hit contents, so response deserialization cost is included.

    NOTE(review): `filters` is accepted but never applied to the query —
    kept (with a safe non-mutable default instead of the previous `= {}`
    mutable default argument) for interface compatibility.
    """
    yql = (
        f"select "
        f"documentid, "
        f"content "
        f"from {DOCUMENT_INDEX_NAME} where " + '({grammar: "weakAnd"}userInput(@query))'
    )
    query = generate_random_sentence()
    query_embedding = embed_query(query)
    num_to_retrieve = 50
    params: dict[str, str | int] = {
        "yql": yql,
        "query": query,
        "input.query(query_embedding)": str(query_embedding),
        "input.query(decay_factor)": str(DOC_TIME_DECAY),
        "hits": num_to_retrieve,
        "offset": 0,
        # Allows benchmarking alternate ranking profiles without editing code.
        "ranking.profile": os.environ.get("VESPA_RANKING_PROFILE", "hybrid_search"),
        "timeout": "10s",
    }
    start = time.monotonic()
    hits = _query_vespa(params)
    # Touch every hit's content so the cost of pulling the full response
    # body is part of the measurement.
    hit_content_len = 0
    for hit in hits:
        hit_content_len += len(hit["fields"].get("content", ""))
    print("Content length", hit_content_len)
    return time.monotonic() - start
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run 50 direct-to-Vespa queries and report the mean latency.
    samples: list[float] = []
    for _ in range(50):
        elapsed = _measure_vespa_latency()
        samples.append(elapsed)
        print("Latency", samples[-1])
    print(f"Average latency: {sum(samples) / len(samples)}")
|
||||
77
backend/scripts/parse_wikipedia.py
Normal file
77
backend/scripts/parse_wikipedia.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import cast
|
||||
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def slugify(value, allow_unicode=False):
    """Return a filesystem/URL-safe slug derived from *value*.

    Adapted from Django's ``django.utils.text.slugify``: normalizes unicode
    (folding to ASCII unless ``allow_unicode``), lowercases, drops characters
    other than alphanumerics/underscores/hyphens/whitespace, collapses runs
    of whitespace and dashes into a single ``-``, and strips any leading or
    trailing dashes and underscores.
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize("NFKC", text)
    else:
        normalized = unicodedata.normalize("NFKD", text)
        text = normalized.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"[^\w\s-]", "", text.lower())
    collapsed = re.sub(r"[-\s]+", "-", text)
    return collapsed.strip("-_")
|
||||
|
||||
|
||||
# This function processes the Wikipedia article, which is passed as an 'element'
|
||||
def process_element(
    element,
    output_dir: str = "/Users/chrisweaver/Downloads/WikipediaProcessedSmall",
) -> int:
    """Write one parsed Wikipedia ``<page>`` element out as a text file.

    The output file is named after the slugified page title and contains the
    title followed by the raw wikitext. Returns 1 if a file was written, 0 if
    the page was skipped (redirect pages, or pages with no revision text —
    previously the latter crashed with an AttributeError because the
    ``cast(str, ...)`` hid the possible ``None`` from ``find``/``findtext``).
    """
    ns = "{http://www.mediawiki.org/xml/export-0.10/}"
    title = element.findtext(f"{ns}title")
    revision = element.find(f"{ns}revision")
    text = revision.findtext(f"{ns}text") if revision is not None else None

    if text is None:
        print(f"Skipping page with no text: {title}")
        return 0
    if text.startswith("#REDIRECT"):
        print(f"Skipping redirect page: {title}")
        return 0

    with open(f"{output_dir}/{slugify(title)}.txt", "w+") as f:
        print(f"Writing '{title}'")
        f.write(f"{title}\n\n{text}")
    return 1
|
||||
|
||||
|
||||
# Location of the (multistream) English Wikipedia XML dump to process.
file_path = (
    "/Users/chrisweaver/Downloads/enwiki-20230820-pages-articles-multistream.xml"
)

# Stream <page> elements one at a time so the multi-GB dump never has to be
# loaded into memory all at once.
context = etree.iterparse(
    file_path, tag="{http://www.mediawiki.org/xml/export-0.10/}page", huge_tree=True
)

# Count of every page seen, including skipped ones.
page_counter = 0
# Stop once this many pages have actually been written out.
n_pages = 50_000

pages_written = 0
for _, element in context:
    pages_written += process_element(element)
    # Release the parsed element immediately to keep memory usage bounded.
    element.clear()
    page_counter += 1
    if pages_written >= n_pages:
        break

# Drop the parser (and any tree fragments it still references).
del context
|
||||
27
backend/scripts/split_wikipedia.py
Normal file
27
backend/scripts/split_wikipedia.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import os
import shutil

# Split the flat WikipediaProcessed directory into numbered sub-directories
# (WikipediaProcessed_0, _1, ...) holding at most `batch_size` files each.
batch_size = 50_000
base_path = "/Users/chrisweaver/Downloads/WikipediaStuff"
wikipedia_path = f"{base_path}/WikipediaProcessed"

file_names = os.listdir(wikipedia_path)

for idx, file_name in enumerate(file_names):
    dir_num, live_cnt = divmod(idx, batch_size)
    # First file of a batch: announce it and ensure the target dir exists.
    if live_cnt == 0:
        print("Creating batch with number", dir_num)
        path = f"{base_path}/WikipediaProcessed_{dir_num}"
        if not os.path.exists(path):
            os.mkdir(path)

    shutil.copy(
        f"{wikipedia_path}/{file_name}",
        f"{base_path}/WikipediaProcessed_{dir_num}/{file_name}",
    )
|
||||
Reference in New Issue
Block a user