Compare commits

...

45 Commits

Author SHA1 Message Date
Weves
36c30a7d94 fix 2023-11-03 00:45:31 -07:00
Weves
4ba37c255a Fix 2023-11-03 00:36:57 -07:00
Weves
e8eb89f6ed fix 2023-11-03 00:33:29 -07:00
Weves
aacee9b21a fix 2023-11-03 00:28:54 -07:00
Weves
4cc0697b66 Standard 2023-11-03 00:25:42 -07:00
Weves
d5d7772524 Complete 2023-11-03 00:24:48 -07:00
Weves
4326866a9d Add wiki scripts 2023-11-02 21:18:00 -07:00
Weves
609c387e9a test 2023-11-02 18:30:18 -07:00
Weves
b66a0df069 Change 2023-11-02 16:09:20 -07:00
Weves
46ff75e675 Fix 2023-11-02 15:31:42 -07:00
Weves
279a3a02f8 Fix 2023-11-02 15:26:02 -07:00
Weves
1954164ec4 fix 2023-11-02 15:24:21 -07:00
Weves
c6f08cd1cd fix 2023-11-02 15:22:30 -07:00
Weves
d9ada5d290 fix percentiles 2023-11-02 13:44:06 -07:00
Weves
0a1fdb80db Fix 2023-11-02 13:28:22 -07:00
Weves
e94d753821 Add percentiles 2023-11-02 13:23:06 -07:00
Weves
8e81b935d1 Fix 2023-11-02 13:07:56 -07:00
Weves
17866dcf8b tesT 2023-11-01 23:38:53 -07:00
Weves
b427fac6d7 50 2023-11-01 23:29:05 -07:00
Weves
9652af82fe test 2023-11-01 23:22:20 -07:00
Weves
340f703ff5 test 2023-11-01 23:20:55 -07:00
Weves
8de8c00531 test 2023-11-01 23:14:56 -07:00
Weves
d82bd97dd4 Fix 2023-11-01 23:13:20 -07:00
Weves
39341516ab Simplify query 2023-11-01 22:45:08 -07:00
Weves
54f5248eb1 Fix 2023-11-01 22:42:31 -07:00
Weves
97412c9b7a Remove summary 2023-11-01 22:42:31 -07:00
Weves
2f770264c8 Add randomly generated sentences 2023-11-01 22:42:31 -07:00
Weves
8777749646 Disable filters 2023-11-01 22:42:31 -07:00
Weves
22d7427e45 adjust vespa fields 2023-11-01 22:42:31 -07:00
Weves
f4e80c9f22 Configure threads per search 2023-11-01 22:42:31 -07:00
Weves
cb06cf7481 make search profile configurable 2023-11-01 22:42:31 -07:00
Weves
0635a81d5e Back to old bank 2023-11-01 22:42:31 -07:00
Weves
437c5856d0 Add back summary 2023-11-01 22:42:31 -07:00
Weves
1a58ad4276 Add embedding 2023-11-01 22:42:31 -07:00
Weves
a9e0967771 Hyrbrid 2023-11-01 22:42:31 -07:00
Weves
da81e843d2 Remove content summary 2023-11-01 22:42:31 -07:00
Weves
34980974ac Isolate vespa 2023-11-01 22:42:31 -07:00
Weves
61ea59affc Add more questions 2023-11-01 22:42:31 -07:00
Weves
83d5d49323 add logging 2023-11-01 22:42:31 -07:00
Weves
ca397c3121 Add more logging 2023-11-01 22:42:31 -07:00
Weves
d2f3e0165a Adjust num to retrieve 2023-11-01 22:42:31 -07:00
Weves
90398f5e56 Adjust timeout across the board 2023-11-01 22:42:31 -07:00
Weves
7c2c5563e3 Handle empty blurb + adjust timeout 2023-11-01 22:42:31 -07:00
Weves
99df68f5ac Adjust script 2023-11-01 22:42:31 -07:00
Weves
ebec047aa5 Benchmarking script 2023-11-01 22:42:31 -07:00
9 changed files with 554 additions and 5 deletions

View File

@@ -31,6 +31,8 @@ schema danswer_chunk {
# https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
field source_type type string {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
# Can also index links https://docs.vespa.ai/en/reference/schema-reference.html#attribute
# URL type matching
@@ -61,6 +63,8 @@ schema danswer_chunk {
}
field hidden type bool {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
field metadata type string {
indexing: summary | attribute
@@ -82,10 +86,12 @@ schema danswer_chunk {
}
field access_control_list type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
field document_sets type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
}

View File

@@ -25,6 +25,17 @@
<disk>0.75</disk>
</resource-limits>
</tuning>
<engine>
<proton>
<tuning>
<searchnode>
<requestthreads>
<persearch>16</persearch>
</requestthreads>
</searchnode>
</tuning>
</proton>
</engine>
<config name="vespa.config.search.summary.juniperrc">
<max_matches>3</max_matches>
<length>750</length>

View File

@@ -58,6 +58,7 @@ from danswer.search.search_runner import query_processing
from danswer.search.search_runner import remove_stop_words
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
from danswer.utils.timing import log_function_time
logger = setup_logger()
@@ -298,6 +299,8 @@ def _index_vespa_chunks(
def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) -> str:
return ""
def _build_or_filters(key: str, vals: list[str] | None) -> str:
if vals is None:
return ""
@@ -412,7 +415,7 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
return InferenceChunk(
chunk_id=fields[CHUNK_ID],
blurb=fields[BLURB],
blurb=fields.get(BLURB, ""),
content=fields[CONTENT],
source_links=source_links_dict,
section_continuation=fields[SECTION_CONTINUATION],
@@ -429,13 +432,27 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
)
@log_function_time()
def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
if "query" in query_params and not cast(str, query_params["query"]).strip():
raise ValueError("No/empty query received")
response = requests.get(SEARCH_ENDPOINT, params=query_params)
logger.info("Making query with params: %s", query_params)
response = requests.get(
SEARCH_ENDPOINT,
params=dict(
**query_params,
**{
"presentation.timing": True,
},
),
)
response.raise_for_status()
hits = response.json()["root"].get("children", [])
response_json = response.json()
logger.debug("Response: %s", response_json)
logger.info("timing info: %s", response_json.get("timing"))
hits = response_json["root"].get("children", [])
for hit in hits:
if hit["fields"].get(CONTENT) is None:
@@ -447,7 +464,6 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
)
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
return inference_chunks
@@ -467,7 +483,7 @@ class VespaIndex(DocumentIndex):
f"{BOOST}, "
f"{HIDDEN}, "
f"{DOC_UPDATED_AT}, "
f"{METADATA}, "
f"{METADATA} "
f"{CONTENT_SUMMARY} "
f"from {DOCUMENT_INDEX_NAME} where "
)
@@ -617,6 +633,7 @@ class VespaIndex(DocumentIndex):
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "keyword_search",
"timeout": "10s",
}
return _query_vespa(params)
@@ -656,6 +673,7 @@ class VespaIndex(DocumentIndex):
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "semantic_search",
"timeout": "10s",
}
return _query_vespa(params)
@@ -695,6 +713,7 @@ class VespaIndex(DocumentIndex):
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "hybrid_search",
"timeout": "10s",
}
return _query_vespa(params)

View File

@@ -61,6 +61,7 @@ def query_processing(
return query
@log_function_time()
def embed_query(
query: str,
embedding_model: SentenceTransformer | None = None,
@@ -362,6 +363,7 @@ def danswer_search(
search_type=question.search_type,
filters=final_filters,
favor_recent=True if question.favor_recent is None else question.favor_recent,
skip_rerank=question.skip_rerank,
)
ranked_chunks, unranked_chunks = search_chunks(

View File

@@ -198,6 +198,7 @@ class QuestionRequest(BaseModel):
enable_auto_detect_filters: bool
favor_recent: bool | None = None
search_type: SearchType = SearchType.HYBRID
skip_rerank: bool = False
class QAFeedbackRequest(BaseModel):

View File

@@ -0,0 +1,215 @@
import random
import time
import nltk
import requests
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
from danswer.search.models import SearchType
# 50 factual trivia questions used as realistic benchmark queries for the
# /document-search endpoint exercised in __main__ below.
question_bank: list[str] = [
    "Who was the first president of the United States?",
    "What is photosynthesis?",
    "How long is the Great Wall of China?",
    "When was the Eiffel Tower constructed?",
    "Who wrote 'Pride and Prejudice'?",
    "What's the difference between mitosis and meiosis?",
    "What is the capital of Brazil?",
    "Who discovered penicillin?",
    "What causes the Aurora Borealis?",
    "When did the Titanic sink?",
    "How does a combustion engine work?",
    "Who is the author of 'The Odyssey'?",
    "What is quantum physics?",
    "When was the Mona Lisa painted?",
    "What's the difference between a meteor and a meteorite?",
    "Who founded the city of Rome?",
    "What is the boiling point of water at sea level?",
    "Who won the Nobel Prize in Literature in 1953?",
    "How do honeybees produce honey?",
    "What is the deepest part of the ocean?",
    "When did the first humans arrive in the Americas?",
    "What is the Fibonacci sequence?",
    "How was the Grand Canyon formed?",
    "Who composed the Moonlight Sonata?",
    "What are the primary colors of light?",
    "When did the Roman Empire fall?",
    "How does photosynthesis contribute to the carbon cycle?",
    "Who was the first woman in space?",
    "What is the Pythagorean theorem?",
    "Which planet is known as the 'Red Planet'?",
    "Who is the father of modern physics?",
    "What is the primary purpose of the United Nations?",
    "How old is the Earth?",
    "Who wrote 'Don Quixote'?",
    "What is the structure of DNA?",
    "When was the Declaration of Independence signed?",
    "What causes a solar eclipse?",
    "Who was the longest-reigning British monarch?",
    "How do tornadoes form?",
    "Who developed the theory of relativity?",
    "What's the tallest mountain on Earth when measured from base to peak?",
    "How many bones are there in the adult human body?",
    "When was the Internet invented?",
    "Who was the ancient Egyptian queen known for her relationship with Roman leaders?",
    "What is the Krebs cycle?",
    "Which country has the largest land area?",
    "Who painted the Starry Night?",
    "What's the difference between an alligator and a crocodile?",
    "Who discovered the circulation of blood?",
    "How many planets are there in our solar system?",
]

# A second batch of 50 questions; concatenated with question_bank in
# __main__ so that 100 distinct queries are available for 100 trials.
additional_questions: list[str] = [
    "Who wrote the play 'Hamlet'?",
    "What is the speed of light in a vacuum?",
    "When did World War I begin?",
    "Who was known as the 'Father of Medicine'?",
    "What's the largest mammal on Earth?",
    "Which element has the atomic number 79?",
    "When did the Renaissance period begin?",
    "What is the currency used in Japan?",
    "Who proposed the theory of evolution by natural selection?",
    "Which planet has a day that lasts longer than its year?",
    "What is the capital of Australia?",
    "Who painted the Last Supper?",
    "How do plants get their green color?",
    "When was the Magna Carta signed?",
    "What are the building blocks of proteins?",
    "Which civilization built Machu Picchu?",
    "What's the most abundant gas in Earth's atmosphere?",
    "Who translated the Bible into German during the Reformation?",
    "What causes the tides in the ocean?",
    "When did the Olympic Games originate?",
    "What is a black hole?",
    "Which river is the longest in the world?",
    "Who invented the telephone?",
    "When was the French Revolution?",
    "What is the smallest prime number?",
    "Which country is known as the Land of the Rising Sun?",
    "Who composed the Four Seasons?",
    "What is the periodic table?",
    "When was the Great Depression?",
    "What is the primary function of red blood cells?",
    "Who is known for his laws of motion?",
    "Which ancient wonder was located in the city of Babylon?",
    "What are the base pairs in DNA?",
    "When was the first airplane flight?",
    "What's the main ingredient in guacamole?",
    "Which empire was ruled by Suleiman the Magnificent?",
    "What is the human body's largest organ?",
    "Who authored 'Brave New World'?",
    "How does electricity work?",
    "When did the Cold War end?",
    "What's the difference between prokaryotic and eukaryotic cells?",
    "Which mountain range includes Mount Everest?",
    "Who is the Greek god of war?",
    "When was the printing press invented?",
    "What are antibiotics used for?",
    "Which desert is the driest on Earth?",
    "Who was the first African American U.S. Supreme Court Justice?",
    "How many teeth do adult humans typically have?",
    "Who is the protagonist in 'The Catcher in the Rye'?",
    "What is the study of fossils called?",
]
# Download the wordlist used by generate_random_sentence below.
# NOTE: this runs at import time and may hit the network on first use;
# subsequent runs should be served from the local nltk_data cache.
nltk.download("words")
from nltk.corpus import words  # noqa: E402
def generate_random_sentence() -> str:
    """Return a nonsense "sentence" of 5-10 random dictionary words.

    The nltk word list is loaded once and cached as a function attribute:
    ``words.words()`` rebuilds the full corpus list on every call, which
    would otherwise dominate the latency this script is trying to measure.
    """
    word_list = getattr(generate_random_sentence, "_word_list", None)
    if word_list is None:
        word_list = words.words()
        generate_random_sentence._word_list = word_list
    sentence_length = random.randint(5, 10)
    return " ".join(random.choices(word_list, k=sentence_length))
def _measure_search_latency(
    query: str,
    search_type: SearchType,
    skip_rerank: bool = True,
    enable_auto_detect_filters: bool = False,
    filters: dict | None = None,
):
    """Time one POST to the local /document-search endpoint.

    Returns the wall-clock seconds for the round trip; raises if the
    server replies with a non-2xx status.
    """
    payload = {
        "query": query,
        "collection": DOCUMENT_INDEX_NAME,
        "filters": filters or {},
        "enable_auto_detect_filters": enable_auto_detect_filters,
        "search_type": search_type,
        "skip_rerank": skip_rerank,
    }
    start = time.monotonic()
    response = requests.post("http://localhost:8080/document-search", json=payload)
    if not response.ok:
        raise Exception(f"Failed to search: {response.text}")
    return time.monotonic() - start
if __name__ == "__main__":
    sentences = question_bank + additional_questions
    num_trials = 100

    def _run_trials(label: str, **search_kwargs) -> None:
        """Run `num_trials` searches and print per-call, avg, P50, P95 latency.

        Factored out of four nearly identical copy-pasted loops; output
        format is unchanged. Percentiles are nearest-rank on the sorted
        sample (index int(n * p)), matching the original reporting.
        """
        latencies: list[float] = []
        for i in range(num_trials):
            latencies.append(
                _measure_search_latency(query=sentences[i], **search_kwargs)
            )
            print("Latency", latencies[-1])
        latencies.sort()
        print(f"[{label}] Average latency: {sum(latencies) / len(latencies)}")
        print(f"[{label}] P50: {latencies[int(num_trials * 0.5)]}")
        print(f"[{label}] P95: {latencies[int(num_trials * 0.95)]}")

    # Keyword search, no reranking
    _run_trials("Keyword", search_type=SearchType.KEYWORD)
    # Hybrid search, no reranking
    _run_trials("Hybrid", search_type=SearchType.HYBRID)
    # Hybrid search + cross-encoder reranking
    _run_trials("Hybrid + CE", search_type=SearchType.HYBRID, skip_rerank=False)
    # Hybrid search + cross-encoder reranking + LLM filter auto-detection
    _run_trials(
        "Hybrid + CE + filters",
        search_type=SearchType.HYBRID,
        skip_rerank=False,
        enable_auto_detect_filters=True,
    )

View File

@@ -0,0 +1,191 @@
import os
import random
import time
from collections.abc import Mapping
import nltk
import requests
from danswer.configs.app_configs import DOC_TIME_DECAY
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
from danswer.document_index.vespa.index import SEARCH_ENDPOINT
from danswer.search.search_runner import embed_query
# Download the wordlist used by generate_random_sentence below.
# NOTE: this runs at import time and may hit the network on first use;
# subsequent runs should be served from the local nltk_data cache.
nltk.download("words")
from nltk.corpus import words  # noqa: E402
# 50 factual trivia questions.
# NOTE(review): neither list is referenced anywhere in this file's visible
# code — __main__ only uses generate_random_sentence(). Presumably copied
# from the API-level benchmark script; confirm whether they can be removed.
question_bank: list[str] = [
    "Who was the first president of the United States?",
    "What is photosynthesis?",
    "How long is the Great Wall of China?",
    "When was the Eiffel Tower constructed?",
    "Who wrote 'Pride and Prejudice'?",
    "What's the difference between mitosis and meiosis?",
    "What is the capital of Brazil?",
    "Who discovered penicillin?",
    "What causes the Aurora Borealis?",
    "When did the Titanic sink?",
    "How does a combustion engine work?",
    "Who is the author of 'The Odyssey'?",
    "What is quantum physics?",
    "When was the Mona Lisa painted?",
    "What's the difference between a meteor and a meteorite?",
    "Who founded the city of Rome?",
    "What is the boiling point of water at sea level?",
    "Who won the Nobel Prize in Literature in 1953?",
    "How do honeybees produce honey?",
    "What is the deepest part of the ocean?",
    "When did the first humans arrive in the Americas?",
    "What is the Fibonacci sequence?",
    "How was the Grand Canyon formed?",
    "Who composed the Moonlight Sonata?",
    "What are the primary colors of light?",
    "When did the Roman Empire fall?",
    "How does photosynthesis contribute to the carbon cycle?",
    "Who was the first woman in space?",
    "What is the Pythagorean theorem?",
    "Which planet is known as the 'Red Planet'?",
    "Who is the father of modern physics?",
    "What is the primary purpose of the United Nations?",
    "How old is the Earth?",
    "Who wrote 'Don Quixote'?",
    "What is the structure of DNA?",
    "When was the Declaration of Independence signed?",
    "What causes a solar eclipse?",
    "Who was the longest-reigning British monarch?",
    "How do tornadoes form?",
    "Who developed the theory of relativity?",
    "What's the tallest mountain on Earth when measured from base to peak?",
    "How many bones are there in the adult human body?",
    "When was the Internet invented?",
    "Who was the ancient Egyptian queen known for her relationship with Roman leaders?",
    "What is the Krebs cycle?",
    "Which country has the largest land area?",
    "Who painted the Starry Night?",
    "What's the difference between an alligator and a crocodile?",
    "Who discovered the circulation of blood?",
    "How many planets are there in our solar system?",
]

# Second batch of 50 questions (see NOTE above — also unused here).
additional_questions: list[str] = [
    "Who wrote the play 'Hamlet'?",
    "What is the speed of light in a vacuum?",
    "When did World War I begin?",
    "Who was known as the 'Father of Medicine'?",
    "What's the largest mammal on Earth?",
    "Which element has the atomic number 79?",
    "When did the Renaissance period begin?",
    "What is the currency used in Japan?",
    "Who proposed the theory of evolution by natural selection?",
    "Which planet has a day that lasts longer than its year?",
    "What is the capital of Australia?",
    "Who painted the Last Supper?",
    "How do plants get their green color?",
    "When was the Magna Carta signed?",
    "What are the building blocks of proteins?",
    "Which civilization built Machu Picchu?",
    "What's the most abundant gas in Earth's atmosphere?",
    "Who translated the Bible into German during the Reformation?",
    "What causes the tides in the ocean?",
    "When did the Olympic Games originate?",
    "What is a black hole?",
    "Which river is the longest in the world?",
    "Who invented the telephone?",
    "When was the French Revolution?",
    "What is the smallest prime number?",
    "Which country is known as the Land of the Rising Sun?",
    "Who composed the Four Seasons?",
    "What is the periodic table?",
    "When was the Great Depression?",
    "What is the primary function of red blood cells?",
    "Who is known for his laws of motion?",
    "Which ancient wonder was located in the city of Babylon?",
    "What are the base pairs in DNA?",
    "When was the first airplane flight?",
    "What's the main ingredient in guacamole?",
    "Which empire was ruled by Suleiman the Magnificent?",
    "What is the human body's largest organ?",
    "Who authored 'Brave New World'?",
    "How does electricity work?",
    "When did the Cold War end?",
    "What's the difference between prokaryotic and eukaryotic cells?",
    "Which mountain range includes Mount Everest?",
    "Who is the Greek god of war?",
    "When was the printing press invented?",
    "What are antibiotics used for?",
    "Which desert is the driest on Earth?",
    "Who was the first African American U.S. Supreme Court Justice?",
    "How many teeth do adult humans typically have?",
    "Who is the protagonist in 'The Catcher in the Rye'?",
    "What is the study of fossils called?",
]
def generate_random_sentence() -> str:
    """Return a nonsense "sentence" of 5-10 random dictionary words.

    The nltk word list is loaded once and cached as a function attribute:
    ``words.words()`` rebuilds the full corpus list on every call, which
    would otherwise dominate the latency this script is trying to measure.
    """
    word_list = getattr(generate_random_sentence, "_word_list", None)
    if word_list is None:
        word_list = words.words()
        generate_random_sentence._word_list = word_list
    sentence_length = random.randint(5, 10)
    return " ".join(random.choices(word_list, k=sentence_length))
def _query_vespa(query_params: Mapping[str, str | int]) -> list:
    """Issue a GET against the Vespa search endpoint and return raw hits.

    Adds ``presentation.timing`` to the request so Vespa includes query
    timing in the response, which is printed for inspection. Raises on a
    non-2xx response.
    """
    merged_params: dict = {**query_params, "presentation.timing": True}
    response = requests.get(SEARCH_ENDPOINT, params=merged_params)
    response.raise_for_status()
    body = response.json()
    print("timing info", body.get("timing"))
    return body["root"].get("children", [])
def _measure_vespa_latency(filters: dict | None = None) -> float:
    """Time a single search query issued directly against Vespa.

    Generates a random nonsense query, embeds it, runs one search with the
    ranking profile from $VESPA_RANKING_PROFILE (default "hybrid_search"),
    and returns the wall-clock seconds spent in the query itself.

    Fixes vs. original: the mutable default argument (`filters: dict = {}`)
    is replaced with None; the elapsed time is captured immediately after
    the query returns, so the content-length accounting below no longer
    inflates the reported latency.

    NOTE(review): `filters` is accepted but never applied to the YQL —
    confirm whether filter support is still intended here.
    """
    yql = (
        "select "
        "documentid, "
        "content "
        f"from {DOCUMENT_INDEX_NAME} where " + '({grammar: "weakAnd"}userInput(@query))'
    )
    query = generate_random_sentence()
    query_embedding = embed_query(query)
    num_to_retrieve = 50
    params: dict[str, str | int] = {
        "yql": yql,
        "query": query,
        "input.query(query_embedding)": str(query_embedding),
        "input.query(decay_factor)": str(DOC_TIME_DECAY),
        "hits": num_to_retrieve,
        "offset": 0,
        # overridable so different ranking profiles can be A/B compared
        "ranking.profile": os.environ.get("VESPA_RANKING_PROFILE", "hybrid_search"),
        "timeout": "10s",
    }
    start = time.monotonic()
    hits = _query_vespa(params)
    elapsed = time.monotonic() - start
    # Report total retrieved content size so payload size can be
    # correlated with latency across runs.
    hit_content_len = sum(len(hit["fields"].get("content", "")) for hit in hits)
    print("Content length", hit_content_len)
    return elapsed
if __name__ == "__main__":
    # 50 trials, printing each latency as it completes, then the mean.
    results: list[float] = []
    for _ in range(50):
        elapsed = _measure_vespa_latency()
        results.append(elapsed)
        print("Latency", elapsed)
    print(f"Average latency: {sum(results) / len(results)}")

View File

@@ -0,0 +1,77 @@
import re
import unicodedata
from typing import cast
from lxml import etree
def slugify(value, allow_unicode=False):
    """Normalize *value* into a filesystem/URL-safe slug.

    Adapted from django.utils.text.slugify. Lowercases the text, removes
    characters that are not alphanumerics, underscores, hyphens, or
    whitespace, collapses whitespace/hyphen runs into single hyphens, and
    trims leading/trailing hyphens and underscores. With ``allow_unicode``
    the text is NFKC-normalized and unicode word characters are kept;
    otherwise it is transliterated to ASCII (NFKD, drop non-ASCII).
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize("NFKC", text)
    else:
        normalized = unicodedata.normalize("NFKD", text)
        text = normalized.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"[^\w\s-]", "", text.lower())
    collapsed = re.sub(r"[-\s]+", "-", text)
    return collapsed.strip("-_")
# Default output location preserved from the original script; callers can
# now override it via the out_dir parameter.
_DEFAULT_OUT_DIR = "/Users/chrisweaver/Downloads/WikipediaProcessedSmall"


def process_element(element, out_dir: str = _DEFAULT_OUT_DIR) -> int:
    """Write one Wikipedia <page> element to a text file under *out_dir*.

    Extracts the page title and revision text, skips redirect pages and
    pages with no revision text (previously a missing revision crashed
    with AttributeError), and writes "<title>\\n\\n<text>" to
    ``<out_dir>/<slug>.txt`` as UTF-8.

    Returns 1 when a file was written, 0 when the page was skipped.
    """
    ns = "{http://www.mediawiki.org/xml/export-0.10/}"
    title = element.findtext(f"{ns}title")
    revision = element.find(f"{ns}revision")
    text = revision.findtext(f"{ns}text") if revision is not None else None
    if text is None:
        # Malformed or empty page — nothing to write.
        print(f"Skipping page with no text: {title}")
        return 0
    if text.startswith("#REDIRECT"):
        print(f"Skipping redirect page: {title}")
        return 0
    # "w" (not "w+") — the file is only written; explicit UTF-8 so the
    # dump's unicode text never trips over the platform's locale encoding.
    with open(f"{out_dir}/{slugify(title)}.txt", "w", encoding="utf-8") as f:
        print(f"Writing '{title}'")
        f.write(f"{title}\n\n{text}")
    return 1
# Path to the Wikipedia XML dump
file_path = (
    "/Users/chrisweaver/Downloads/enwiki-20230820-pages-articles-multistream.xml"
)
# Create an iterable XML parser: streams <page> elements one at a time so
# the multi-GB dump is never fully loaded; huge_tree lifts lxml's default
# document-size safety limits for very large inputs.
context = etree.iterparse(
    file_path, tag="{http://www.mediawiki.org/xml/export-0.10/}page", huge_tree=True
)
# Counter for number of pages processed (including skipped ones)
page_counter = 0
# Number of pages you want to extract (written, not merely processed)
n_pages = 50_000
pages_written = 0
for _, element in context:
    # process_element returns 1 when a file was written, 0 when skipped
    pages_written += process_element(element)
    # Clear the element to free up memory.
    # NOTE(review): clearing the element alone may still retain memory via
    # references held by ancestors/preceding siblings — confirm RSS stays
    # flat over a full run.
    element.clear()
    page_counter += 1
    if pages_written >= n_pages:
        break
# Clean up the XML parser and delete the associated memory
del context

View File

@@ -0,0 +1,27 @@
import os
import shutil
# Split the flat WikipediaProcessed directory into numbered batch
# directories of at most `batch_size` files each, by copying.
batch_size = 50_000
base_path = "/Users/chrisweaver/Downloads/WikipediaStuff"
wikipedia_path = f"{base_path}/WikipediaProcessed"
file_names = os.listdir(wikipedia_path)
dir_num = 0
live_cnt = 0  # files copied into the current batch directory so far
for file_name in file_names:
    if live_cnt == 0:
        # Starting a new batch — make sure its directory exists.
        print("Creating batch with number", dir_num)
        batch_dir = f"{base_path}/WikipediaProcessed_{dir_num}"
        if not os.path.exists(batch_dir):
            os.mkdir(batch_dir)
    src = f"{wikipedia_path}/{file_name}"
    dst = f"{base_path}/WikipediaProcessed_{dir_num}/{file_name}"
    shutil.copy(src, dst)
    live_cnt += 1
    if live_cnt == batch_size:
        # Batch full — roll over to the next directory.
        live_cnt = 0
        dir_num += 1