Compare commits

...

45 Commits

Author SHA1 Message Date
Weves
36c30a7d94 fix 2023-11-03 00:45:31 -07:00
Weves
4ba37c255a Fix 2023-11-03 00:36:57 -07:00
Weves
e8eb89f6ed fix 2023-11-03 00:33:29 -07:00
Weves
aacee9b21a fix 2023-11-03 00:28:54 -07:00
Weves
4cc0697b66 Standard 2023-11-03 00:25:42 -07:00
Weves
d5d7772524 Complete 2023-11-03 00:24:48 -07:00
Weves
4326866a9d Add wiki scripts 2023-11-02 21:18:00 -07:00
Weves
609c387e9a test 2023-11-02 18:30:18 -07:00
Weves
b66a0df069 Change 2023-11-02 16:09:20 -07:00
Weves
46ff75e675 Fix 2023-11-02 15:31:42 -07:00
Weves
279a3a02f8 Fix 2023-11-02 15:26:02 -07:00
Weves
1954164ec4 fix 2023-11-02 15:24:21 -07:00
Weves
c6f08cd1cd fix 2023-11-02 15:22:30 -07:00
Weves
d9ada5d290 fix percentiles 2023-11-02 13:44:06 -07:00
Weves
0a1fdb80db Fix 2023-11-02 13:28:22 -07:00
Weves
e94d753821 Add percentiles 2023-11-02 13:23:06 -07:00
Weves
8e81b935d1 Fix 2023-11-02 13:07:56 -07:00
Weves
17866dcf8b tesT 2023-11-01 23:38:53 -07:00
Weves
b427fac6d7 50 2023-11-01 23:29:05 -07:00
Weves
9652af82fe test 2023-11-01 23:22:20 -07:00
Weves
340f703ff5 test 2023-11-01 23:20:55 -07:00
Weves
8de8c00531 test 2023-11-01 23:14:56 -07:00
Weves
d82bd97dd4 Fix 2023-11-01 23:13:20 -07:00
Weves
39341516ab Simplify query 2023-11-01 22:45:08 -07:00
Weves
54f5248eb1 Fix 2023-11-01 22:42:31 -07:00
Weves
97412c9b7a Remove summary 2023-11-01 22:42:31 -07:00
Weves
2f770264c8 Add randomly generated sentences 2023-11-01 22:42:31 -07:00
Weves
8777749646 Disable filters 2023-11-01 22:42:31 -07:00
Weves
22d7427e45 adjust vespa fields 2023-11-01 22:42:31 -07:00
Weves
f4e80c9f22 Configure threads per search 2023-11-01 22:42:31 -07:00
Weves
cb06cf7481 make search profile configurable 2023-11-01 22:42:31 -07:00
Weves
0635a81d5e Back to old bank 2023-11-01 22:42:31 -07:00
Weves
437c5856d0 Add back summary 2023-11-01 22:42:31 -07:00
Weves
1a58ad4276 Add embedding 2023-11-01 22:42:31 -07:00
Weves
a9e0967771 Hyrbrid 2023-11-01 22:42:31 -07:00
Weves
da81e843d2 Remove content summary 2023-11-01 22:42:31 -07:00
Weves
34980974ac Isolate vespa 2023-11-01 22:42:31 -07:00
Weves
61ea59affc Add more questions 2023-11-01 22:42:31 -07:00
Weves
83d5d49323 add logging 2023-11-01 22:42:31 -07:00
Weves
ca397c3121 Add more logging 2023-11-01 22:42:31 -07:00
Weves
d2f3e0165a Adjust num to retrieve 2023-11-01 22:42:31 -07:00
Weves
90398f5e56 Adjust timeout across the board 2023-11-01 22:42:31 -07:00
Weves
7c2c5563e3 Handle empty blurb + adjust timeout 2023-11-01 22:42:31 -07:00
Weves
99df68f5ac Adjust script 2023-11-01 22:42:31 -07:00
Weves
ebec047aa5 Benchmarking script 2023-11-01 22:42:31 -07:00
9 changed files with 554 additions and 5 deletions

View File

@@ -31,6 +31,8 @@ schema danswer_chunk {
# https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
field source_type type string {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
# Can also index links https://docs.vespa.ai/en/reference/schema-reference.html#attribute
# URL type matching
@@ -61,6 +63,8 @@ schema danswer_chunk {
}
field hidden type bool {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
field metadata type string {
indexing: summary | attribute
@@ -82,10 +86,12 @@ schema danswer_chunk {
}
field access_control_list type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
field document_sets type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
}

View File

@@ -25,6 +25,17 @@
<disk>0.75</disk>
</resource-limits>
</tuning>
<engine>
<proton>
<tuning>
<searchnode>
<requestthreads>
<persearch>16</persearch>
</requestthreads>
</searchnode>
</tuning>
</proton>
</engine>
<config name="vespa.config.search.summary.juniperrc">
<max_matches>3</max_matches>
<length>750</length>

View File

@@ -58,6 +58,7 @@ from danswer.search.search_runner import query_processing
from danswer.search.search_runner import remove_stop_words
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
from danswer.utils.timing import log_function_time
logger = setup_logger()
@@ -298,6 +299,8 @@ def _index_vespa_chunks(
def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) -> str:
return ""
def _build_or_filters(key: str, vals: list[str] | None) -> str:
if vals is None:
return ""
@@ -412,7 +415,7 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
return InferenceChunk(
chunk_id=fields[CHUNK_ID],
blurb=fields[BLURB],
blurb=fields.get(BLURB, ""),
content=fields[CONTENT],
source_links=source_links_dict,
section_continuation=fields[SECTION_CONTINUATION],
@@ -429,13 +432,27 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
)
@log_function_time()
def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
if "query" in query_params and not cast(str, query_params["query"]).strip():
raise ValueError("No/empty query received")
response = requests.get(SEARCH_ENDPOINT, params=query_params)
logger.info("Making query with params: %s", query_params)
response = requests.get(
SEARCH_ENDPOINT,
params=dict(
**query_params,
**{
"presentation.timing": True,
},
),
)
response.raise_for_status()
hits = response.json()["root"].get("children", [])
response_json = response.json()
logger.debug("Response: %s", response_json)
logger.info("timing info: %s", response_json.get("timing"))
hits = response_json["root"].get("children", [])
for hit in hits:
if hit["fields"].get(CONTENT) is None:
@@ -447,7 +464,6 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
)
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
return inference_chunks
@@ -467,7 +483,7 @@ class VespaIndex(DocumentIndex):
f"{BOOST}, "
f"{HIDDEN}, "
f"{DOC_UPDATED_AT}, "
f"{METADATA}, "
f"{METADATA} "
f"{CONTENT_SUMMARY} "
f"from {DOCUMENT_INDEX_NAME} where "
)
@@ -617,6 +633,7 @@ class VespaIndex(DocumentIndex):
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "keyword_search",
"timeout": "10s",
}
return _query_vespa(params)
@@ -656,6 +673,7 @@ class VespaIndex(DocumentIndex):
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "semantic_search",
"timeout": "10s",
}
return _query_vespa(params)
@@ -695,6 +713,7 @@ class VespaIndex(DocumentIndex):
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "hybrid_search",
"timeout": "10s",
}
return _query_vespa(params)

View File

@@ -61,6 +61,7 @@ def query_processing(
return query
@log_function_time()
def embed_query(
query: str,
embedding_model: SentenceTransformer | None = None,
@@ -362,6 +363,7 @@ def danswer_search(
search_type=question.search_type,
filters=final_filters,
favor_recent=True if question.favor_recent is None else question.favor_recent,
skip_rerank=question.skip_rerank,
)
ranked_chunks, unranked_chunks = search_chunks(

View File

@@ -198,6 +198,7 @@ class QuestionRequest(BaseModel):
enable_auto_detect_filters: bool
favor_recent: bool | None = None
search_type: SearchType = SearchType.HYBRID
skip_rerank: bool = False
class QAFeedbackRequest(BaseModel):

View File

@@ -0,0 +1,215 @@
import random
import time
import nltk
import requests
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
from danswer.search.models import SearchType
# 50 factual trivia questions used as realistic benchmark queries for the
# /document-search endpoint exercised in __main__ below.
question_bank: list[str] = [
    "Who was the first president of the United States?",
    "What is photosynthesis?",
    "How long is the Great Wall of China?",
    "When was the Eiffel Tower constructed?",
    "Who wrote 'Pride and Prejudice'?",
    "What's the difference between mitosis and meiosis?",
    "What is the capital of Brazil?",
    "Who discovered penicillin?",
    "What causes the Aurora Borealis?",
    "When did the Titanic sink?",
    "How does a combustion engine work?",
    "Who is the author of 'The Odyssey'?",
    "What is quantum physics?",
    "When was the Mona Lisa painted?",
    "What's the difference between a meteor and a meteorite?",
    "Who founded the city of Rome?",
    "What is the boiling point of water at sea level?",
    "Who won the Nobel Prize in Literature in 1953?",
    "How do honeybees produce honey?",
    "What is the deepest part of the ocean?",
    "When did the first humans arrive in the Americas?",
    "What is the Fibonacci sequence?",
    "How was the Grand Canyon formed?",
    "Who composed the Moonlight Sonata?",
    "What are the primary colors of light?",
    "When did the Roman Empire fall?",
    "How does photosynthesis contribute to the carbon cycle?",
    "Who was the first woman in space?",
    "What is the Pythagorean theorem?",
    "Which planet is known as the 'Red Planet'?",
    "Who is the father of modern physics?",
    "What is the primary purpose of the United Nations?",
    "How old is the Earth?",
    "Who wrote 'Don Quixote'?",
    "What is the structure of DNA?",
    "When was the Declaration of Independence signed?",
    "What causes a solar eclipse?",
    "Who was the longest-reigning British monarch?",
    "How do tornadoes form?",
    "Who developed the theory of relativity?",
    "What's the tallest mountain on Earth when measured from base to peak?",
    "How many bones are there in the adult human body?",
    "When was the Internet invented?",
    "Who was the ancient Egyptian queen known for her relationship with Roman leaders?",
    "What is the Krebs cycle?",
    "Which country has the largest land area?",
    "Who painted the Starry Night?",
    "What's the difference between an alligator and a crocodile?",
    "Who discovered the circulation of blood?",
    "How many planets are there in our solar system?",
]

# A second batch of 50 questions; concatenated with question_bank in
# __main__ so that 100 distinct queries are available for 100 trials.
additional_questions: list[str] = [
    "Who wrote the play 'Hamlet'?",
    "What is the speed of light in a vacuum?",
    "When did World War I begin?",
    "Who was known as the 'Father of Medicine'?",
    "What's the largest mammal on Earth?",
    "Which element has the atomic number 79?",
    "When did the Renaissance period begin?",
    "What is the currency used in Japan?",
    "Who proposed the theory of evolution by natural selection?",
    "Which planet has a day that lasts longer than its year?",
    "What is the capital of Australia?",
    "Who painted the Last Supper?",
    "How do plants get their green color?",
    "When was the Magna Carta signed?",
    "What are the building blocks of proteins?",
    "Which civilization built Machu Picchu?",
    "What's the most abundant gas in Earth's atmosphere?",
    "Who translated the Bible into German during the Reformation?",
    "What causes the tides in the ocean?",
    "When did the Olympic Games originate?",
    "What is a black hole?",
    "Which river is the longest in the world?",
    "Who invented the telephone?",
    "When was the French Revolution?",
    "What is the smallest prime number?",
    "Which country is known as the Land of the Rising Sun?",
    "Who composed the Four Seasons?",
    "What is the periodic table?",
    "When was the Great Depression?",
    "What is the primary function of red blood cells?",
    "Who is known for his laws of motion?",
    "Which ancient wonder was located in the city of Babylon?",
    "What are the base pairs in DNA?",
    "When was the first airplane flight?",
    "What's the main ingredient in guacamole?",
    "Which empire was ruled by Suleiman the Magnificent?",
    "What is the human body's largest organ?",
    "Who authored 'Brave New World'?",
    "How does electricity work?",
    "When did the Cold War end?",
    "What's the difference between prokaryotic and eukaryotic cells?",
    "Which mountain range includes Mount Everest?",
    "Who is the Greek god of war?",
    "When was the printing press invented?",
    "What are antibiotics used for?",
    "Which desert is the driest on Earth?",
    "Who was the first African American U.S. Supreme Court Justice?",
    "How many teeth do adult humans typically have?",
    "Who is the protagonist in 'The Catcher in the Rye'?",
    "What is the study of fossils called?",
]
# Download the wordlist used by generate_random_sentence below.
# NOTE: this runs at import time and may hit the network on first use;
# subsequent runs should be served from the local nltk_data cache.
nltk.download("words")
from nltk.corpus import words  # noqa: E402
def generate_random_sentence() -> str:
    """Return a nonsense "sentence" of 5-10 random dictionary words.

    The nltk word list is loaded once and cached as a function attribute:
    ``words.words()`` rebuilds the full corpus list on every call, which
    would otherwise dominate the latency this script is trying to measure.
    """
    word_list = getattr(generate_random_sentence, "_word_list", None)
    if word_list is None:
        word_list = words.words()
        generate_random_sentence._word_list = word_list
    sentence_length = random.randint(5, 10)
    return " ".join(random.choices(word_list, k=sentence_length))
def _measure_search_latency(
    query: str,
    search_type: SearchType,
    skip_rerank: bool = True,
    enable_auto_detect_filters: bool = False,
    filters: dict | None = None,
):
    """Time one POST to the local /document-search endpoint.

    Returns the wall-clock seconds for the round trip; raises if the
    server replies with a non-2xx status.
    """
    payload = {
        "query": query,
        "collection": DOCUMENT_INDEX_NAME,
        "filters": filters or {},
        "enable_auto_detect_filters": enable_auto_detect_filters,
        "search_type": search_type,
        "skip_rerank": skip_rerank,
    }
    start = time.monotonic()
    response = requests.post("http://localhost:8080/document-search", json=payload)
    if not response.ok:
        raise Exception(f"Failed to search: {response.text}")
    return time.monotonic() - start
if __name__ == "__main__":
    sentences = question_bank + additional_questions
    num_trials = 100

    def _run_trials(label: str, **search_kwargs) -> None:
        """Run `num_trials` searches and print per-call, avg, P50, P95 latency.

        Factored out of four nearly identical copy-pasted loops; output
        format is unchanged. Percentiles are nearest-rank on the sorted
        sample (index int(n * p)), matching the original reporting.
        """
        latencies: list[float] = []
        for i in range(num_trials):
            latencies.append(
                _measure_search_latency(query=sentences[i], **search_kwargs)
            )
            print("Latency", latencies[-1])
        latencies.sort()
        print(f"[{label}] Average latency: {sum(latencies) / len(latencies)}")
        print(f"[{label}] P50: {latencies[int(num_trials * 0.5)]}")
        print(f"[{label}] P95: {latencies[int(num_trials * 0.95)]}")

    # Keyword search, no reranking
    _run_trials("Keyword", search_type=SearchType.KEYWORD)
    # Hybrid search, no reranking
    _run_trials("Hybrid", search_type=SearchType.HYBRID)
    # Hybrid search + cross-encoder reranking
    _run_trials("Hybrid + CE", search_type=SearchType.HYBRID, skip_rerank=False)
    # Hybrid search + cross-encoder reranking + LLM filter auto-detection
    _run_trials(
        "Hybrid + CE + filters",
        search_type=SearchType.HYBRID,
        skip_rerank=False,
        enable_auto_detect_filters=True,
    )

View File

@@ -0,0 +1,191 @@
import os
import random
import time
from collections.abc import Mapping
import nltk
import requests
from danswer.configs.app_configs import DOC_TIME_DECAY
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
from danswer.document_index.vespa.index import SEARCH_ENDPOINT
from danswer.search.search_runner import embed_query
# Download the wordlist used by generate_random_sentence below.
# NOTE: this runs at import time and may hit the network on first use;
# subsequent runs should be served from the local nltk_data cache.
nltk.download("words")
from nltk.corpus import words  # noqa: E402
# 50 factual trivia questions.
# NOTE(review): neither list is referenced anywhere in this file's visible
# code — __main__ only uses generate_random_sentence(). Presumably copied
# from the API-level benchmark script; confirm whether they can be removed.
question_bank: list[str] = [
    "Who was the first president of the United States?",
    "What is photosynthesis?",
    "How long is the Great Wall of China?",
    "When was the Eiffel Tower constructed?",
    "Who wrote 'Pride and Prejudice'?",
    "What's the difference between mitosis and meiosis?",
    "What is the capital of Brazil?",
    "Who discovered penicillin?",
    "What causes the Aurora Borealis?",
    "When did the Titanic sink?",
    "How does a combustion engine work?",
    "Who is the author of 'The Odyssey'?",
    "What is quantum physics?",
    "When was the Mona Lisa painted?",
    "What's the difference between a meteor and a meteorite?",
    "Who founded the city of Rome?",
    "What is the boiling point of water at sea level?",
    "Who won the Nobel Prize in Literature in 1953?",
    "How do honeybees produce honey?",
    "What is the deepest part of the ocean?",
    "When did the first humans arrive in the Americas?",
    "What is the Fibonacci sequence?",
    "How was the Grand Canyon formed?",
    "Who composed the Moonlight Sonata?",
    "What are the primary colors of light?",
    "When did the Roman Empire fall?",
    "How does photosynthesis contribute to the carbon cycle?",
    "Who was the first woman in space?",
    "What is the Pythagorean theorem?",
    "Which planet is known as the 'Red Planet'?",
    "Who is the father of modern physics?",
    "What is the primary purpose of the United Nations?",
    "How old is the Earth?",
    "Who wrote 'Don Quixote'?",
    "What is the structure of DNA?",
    "When was the Declaration of Independence signed?",
    "What causes a solar eclipse?",
    "Who was the longest-reigning British monarch?",
    "How do tornadoes form?",
    "Who developed the theory of relativity?",
    "What's the tallest mountain on Earth when measured from base to peak?",
    "How many bones are there in the adult human body?",
    "When was the Internet invented?",
    "Who was the ancient Egyptian queen known for her relationship with Roman leaders?",
    "What is the Krebs cycle?",
    "Which country has the largest land area?",
    "Who painted the Starry Night?",
    "What's the difference between an alligator and a crocodile?",
    "Who discovered the circulation of blood?",
    "How many planets are there in our solar system?",
]

# Second batch of 50 questions (see NOTE above — also unused here).
additional_questions: list[str] = [
    "Who wrote the play 'Hamlet'?",
    "What is the speed of light in a vacuum?",
    "When did World War I begin?",
    "Who was known as the 'Father of Medicine'?",
    "What's the largest mammal on Earth?",
    "Which element has the atomic number 79?",
    "When did the Renaissance period begin?",
    "What is the currency used in Japan?",
    "Who proposed the theory of evolution by natural selection?",
    "Which planet has a day that lasts longer than its year?",
    "What is the capital of Australia?",
    "Who painted the Last Supper?",
    "How do plants get their green color?",
    "When was the Magna Carta signed?",
    "What are the building blocks of proteins?",
    "Which civilization built Machu Picchu?",
    "What's the most abundant gas in Earth's atmosphere?",
    "Who translated the Bible into German during the Reformation?",
    "What causes the tides in the ocean?",
    "When did the Olympic Games originate?",
    "What is a black hole?",
    "Which river is the longest in the world?",
    "Who invented the telephone?",
    "When was the French Revolution?",
    "What is the smallest prime number?",
    "Which country is known as the Land of the Rising Sun?",
    "Who composed the Four Seasons?",
    "What is the periodic table?",
    "When was the Great Depression?",
    "What is the primary function of red blood cells?",
    "Who is known for his laws of motion?",
    "Which ancient wonder was located in the city of Babylon?",
    "What are the base pairs in DNA?",
    "When was the first airplane flight?",
    "What's the main ingredient in guacamole?",
    "Which empire was ruled by Suleiman the Magnificent?",
    "What is the human body's largest organ?",
    "Who authored 'Brave New World'?",
    "How does electricity work?",
    "When did the Cold War end?",
    "What's the difference between prokaryotic and eukaryotic cells?",
    "Which mountain range includes Mount Everest?",
    "Who is the Greek god of war?",
    "When was the printing press invented?",
    "What are antibiotics used for?",
    "Which desert is the driest on Earth?",
    "Who was the first African American U.S. Supreme Court Justice?",
    "How many teeth do adult humans typically have?",
    "Who is the protagonist in 'The Catcher in the Rye'?",
    "What is the study of fossils called?",
]
def generate_random_sentence() -> str:
    """Return a nonsense "sentence" of 5-10 random dictionary words.

    The nltk word list is loaded once and cached as a function attribute:
    ``words.words()`` rebuilds the full corpus list on every call, which
    would otherwise dominate the latency this script is trying to measure.
    """
    word_list = getattr(generate_random_sentence, "_word_list", None)
    if word_list is None:
        word_list = words.words()
        generate_random_sentence._word_list = word_list
    sentence_length = random.randint(5, 10)
    return " ".join(random.choices(word_list, k=sentence_length))
def _query_vespa(query_params: Mapping[str, str | int]) -> list:
    """Issue a GET against the Vespa search endpoint and return raw hits.

    Adds ``presentation.timing`` to the request so Vespa includes query
    timing in the response, which is printed for inspection. Raises on a
    non-2xx response.
    """
    merged_params: dict = {**query_params, "presentation.timing": True}
    response = requests.get(SEARCH_ENDPOINT, params=merged_params)
    response.raise_for_status()
    body = response.json()
    print("timing info", body.get("timing"))
    return body["root"].get("children", [])
def _measure_vespa_latency(filters: dict | None = None) -> float:
    """Time a single search query issued directly against Vespa.

    Generates a random nonsense query, embeds it, runs one search with the
    ranking profile from $VESPA_RANKING_PROFILE (default "hybrid_search"),
    and returns the wall-clock seconds spent in the query itself.

    Fixes vs. original: the mutable default argument (`filters: dict = {}`)
    is replaced with None; the elapsed time is captured immediately after
    the query returns, so the content-length accounting below no longer
    inflates the reported latency.

    NOTE(review): `filters` is accepted but never applied to the YQL —
    confirm whether filter support is still intended here.
    """
    yql = (
        "select "
        "documentid, "
        "content "
        f"from {DOCUMENT_INDEX_NAME} where " + '({grammar: "weakAnd"}userInput(@query))'
    )
    query = generate_random_sentence()
    query_embedding = embed_query(query)
    num_to_retrieve = 50
    params: dict[str, str | int] = {
        "yql": yql,
        "query": query,
        "input.query(query_embedding)": str(query_embedding),
        "input.query(decay_factor)": str(DOC_TIME_DECAY),
        "hits": num_to_retrieve,
        "offset": 0,
        # overridable so different ranking profiles can be A/B compared
        "ranking.profile": os.environ.get("VESPA_RANKING_PROFILE", "hybrid_search"),
        "timeout": "10s",
    }
    start = time.monotonic()
    hits = _query_vespa(params)
    elapsed = time.monotonic() - start
    # Report total retrieved content size so payload size can be
    # correlated with latency across runs.
    hit_content_len = sum(len(hit["fields"].get("content", "")) for hit in hits)
    print("Content length", hit_content_len)
    return elapsed
if __name__ == "__main__":
    # 50 trials, printing each latency as it completes, then the mean.
    results: list[float] = []
    for _ in range(50):
        elapsed = _measure_vespa_latency()
        results.append(elapsed)
        print("Latency", elapsed)
    print(f"Average latency: {sum(results) / len(results)}")

View File

@@ -0,0 +1,77 @@
import re
import unicodedata
from typing import cast
from lxml import etree
def slugify(value, allow_unicode=False):
    """Normalize *value* into a filesystem/URL-safe slug.

    Adapted from django.utils.text.slugify. Lowercases the text, removes
    characters that are not alphanumerics, underscores, hyphens, or
    whitespace, collapses whitespace/hyphen runs into single hyphens, and
    trims leading/trailing hyphens and underscores. With ``allow_unicode``
    the text is NFKC-normalized and unicode word characters are kept;
    otherwise it is transliterated to ASCII (NFKD, drop non-ASCII).
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize("NFKC", text)
    else:
        normalized = unicodedata.normalize("NFKD", text)
        text = normalized.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"[^\w\s-]", "", text.lower())
    collapsed = re.sub(r"[-\s]+", "-", text)
    return collapsed.strip("-_")
# Default output location preserved from the original script; callers can
# now override it via the out_dir parameter.
_DEFAULT_OUT_DIR = "/Users/chrisweaver/Downloads/WikipediaProcessedSmall"


def process_element(element, out_dir: str = _DEFAULT_OUT_DIR) -> int:
    """Write one Wikipedia <page> element to a text file under *out_dir*.

    Extracts the page title and revision text, skips redirect pages and
    pages with no revision text (previously a missing revision crashed
    with AttributeError), and writes "<title>\\n\\n<text>" to
    ``<out_dir>/<slug>.txt`` as UTF-8.

    Returns 1 when a file was written, 0 when the page was skipped.
    """
    ns = "{http://www.mediawiki.org/xml/export-0.10/}"
    title = element.findtext(f"{ns}title")
    revision = element.find(f"{ns}revision")
    text = revision.findtext(f"{ns}text") if revision is not None else None
    if text is None:
        # Malformed or empty page — nothing to write.
        print(f"Skipping page with no text: {title}")
        return 0
    if text.startswith("#REDIRECT"):
        print(f"Skipping redirect page: {title}")
        return 0
    # "w" (not "w+") — the file is only written; explicit UTF-8 so the
    # dump's unicode text never trips over the platform's locale encoding.
    with open(f"{out_dir}/{slugify(title)}.txt", "w", encoding="utf-8") as f:
        print(f"Writing '{title}'")
        f.write(f"{title}\n\n{text}")
    return 1
# Path to the Wikipedia XML dump
file_path = (
    "/Users/chrisweaver/Downloads/enwiki-20230820-pages-articles-multistream.xml"
)
# Create an iterable XML parser: streams <page> elements one at a time so
# the multi-GB dump is never fully loaded; huge_tree lifts lxml's default
# document-size safety limits for very large inputs.
context = etree.iterparse(
    file_path, tag="{http://www.mediawiki.org/xml/export-0.10/}page", huge_tree=True
)
# Counter for number of pages processed (including skipped ones)
page_counter = 0
# Number of pages you want to extract (written, not merely processed)
n_pages = 50_000
pages_written = 0
for _, element in context:
    # process_element returns 1 when a file was written, 0 when skipped
    pages_written += process_element(element)
    # Clear the element to free up memory.
    # NOTE(review): clearing the element alone may still retain memory via
    # references held by ancestors/preceding siblings — confirm RSS stays
    # flat over a full run.
    element.clear()
    page_counter += 1
    if pages_written >= n_pages:
        break
# Clean up the XML parser and delete the associated memory
del context

View File

@@ -0,0 +1,27 @@
import os
import shutil
# Split the flat WikipediaProcessed directory into numbered batch
# directories of at most `batch_size` files each, by copying.
batch_size = 50_000
base_path = "/Users/chrisweaver/Downloads/WikipediaStuff"
wikipedia_path = f"{base_path}/WikipediaProcessed"
file_names = os.listdir(wikipedia_path)
dir_num = 0
live_cnt = 0  # files copied into the current batch directory so far
for file_name in file_names:
    if live_cnt == 0:
        # Starting a new batch — make sure its directory exists.
        print("Creating batch with number", dir_num)
        batch_dir = f"{base_path}/WikipediaProcessed_{dir_num}"
        if not os.path.exists(batch_dir):
            os.mkdir(batch_dir)
    src = f"{wikipedia_path}/{file_name}"
    dst = f"{base_path}/WikipediaProcessed_{dir_num}/{file_name}"
    shutil.copy(src, dst)
    live_cnt += 1
    if live_cnt == batch_size:
        # Batch full — roll over to the next directory.
        live_cnt = 0
        dir_num += 1