Compare commits

...

8 Commits

Author SHA1 Message Date
Richard Kuo (Danswer)
1966127bd4 trivy workaround 2024-10-03 15:25:54 -07:00
rkuo-danswer
3ac84da698 Merge pull request #2676 from danswer-ai/hotfix/v0.7-vespa-delete-performance
hotfix for vespa delete performance
2024-10-03 10:59:32 -07:00
rkuo-danswer
7c7f5b37f5 Merge pull request #2675 from danswer-ai/hotfix/v0.7-bump-celery
bump celery
2024-10-03 10:59:14 -07:00
Richard Kuo (Danswer)
0bf9243891 Merge branch 'release/v0.7' of github.com:danswer-ai/danswer into hotfix/v0.7-bump-celery 2024-10-03 10:21:41 -07:00
Richard Kuo (Danswer)
cfe4bbe3c7 Merge branch 'release/v0.7' of github.com:danswer-ai/danswer into hotfix/v0.7-vespa-delete-performance 2024-10-03 10:21:23 -07:00
Richard Kuo (Danswer)
9d18b92b90 fix sync checks 2024-10-03 10:20:57 -07:00
Richard Kuo (Danswer)
74315e21b3 bump celery 2024-10-03 09:44:25 -07:00
Richard Kuo (Danswer)
f9a5b227a1 hotfix for vespa delete performance 2024-10-03 09:43:02 -07:00
10 changed files with 106 additions and 5 deletions

View File

@@ -46,8 +46,16 @@ jobs:
build-args: |
DANSWER_VERSION=${{ github.ref_name }}
# Trivy has its own rate-limiting issues, which cause this action to flake.
# We work around them by hardcoding different DB repos in env.
# This workaround can be revisited once the upstream issues below are resolved:
# https://github.com/aquasecurity/trivy/discussions/7538
# https://github.com/aquasecurity/trivy-action/issues/389
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@master
env:
TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
with:
# To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend
image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}

View File

@@ -40,8 +40,16 @@ jobs:
build-args: |
DANSWER_VERSION=${{ github.ref_name }}
# Trivy has its own rate-limiting issues, which cause this action to flake.
# We work around them by hardcoding different DB repos in env.
# This workaround can be revisited once the upstream issues below are resolved:
# https://github.com/aquasecurity/trivy/discussions/7538
# https://github.com/aquasecurity/trivy-action/issues/389
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@master
env:
TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
with:
image-ref: docker.io/danswer/danswer-model-server:${{ github.ref_name }}
severity: 'CRITICAL,HIGH'

View File

@@ -113,8 +113,16 @@ jobs:
run: |
docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}
# Trivy has its own rate-limiting issues, which cause this action to flake.
# We work around them by hardcoding different DB repos in env.
# This workaround can be revisited once the upstream issues below are resolved:
# https://github.com/aquasecurity/trivy/discussions/7538
# https://github.com/aquasecurity/trivy-action/issues/389
- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@master
env:
TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
with:
image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
severity: 'CRITICAL,HIGH'

View File

@@ -3,7 +3,9 @@ name: Python Checks
on:
merge_group:
pull_request:
branches: [ main ]
branches:
- main
- 'release/**'
jobs:
mypy-check:

View File

@@ -3,7 +3,9 @@ name: Python Unit Tests
on:
merge_group:
pull_request:
branches: [ main ]
branches:
- main
- 'release/**'
jobs:
backend-check:

View File

@@ -6,7 +6,9 @@ concurrency:
on:
merge_group:
pull_request:
branches: [ main ]
branches:
- main
- 'release/**'
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

View File

@@ -148,7 +148,7 @@ def document_by_cc_pair_cleanup_task(
if count == 1:
# count == 1 means this is the only remaining cc_pair reference to the doc
# delete it from vespa and the db
document_index.delete(doc_ids=[document_id])
document_index.delete_single(doc_id=document_id)
delete_documents_complete__no_commit(
db_session=db_session,
document_ids=[document_id],

View File

@@ -156,6 +156,16 @@ class Deletable(abc.ABC):
Class must implement the ability to delete document by their unique document ids.
"""
@abc.abstractmethod
def delete_single(self, doc_id: str) -> None:
    """
    Hard delete one document, identified by its id, from the document index.

    Implementations are expected to override this; the base version only raises.

    Parameters:
    - doc_id: document id as specified by the connector

    Raises:
    - NotImplementedError: always, when the abstract base is invoked directly
    """
    raise NotImplementedError
@abc.abstractmethod
def delete(self, doc_ids: list[str]) -> None:
"""

View File

@@ -13,6 +13,7 @@ from typing import cast
import httpx
import requests
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
from danswer.configs.chat_configs import DOC_TIME_DECAY
from danswer.configs.chat_configs import NUM_RETURNED_HITS
from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -479,6 +480,66 @@ class VespaIndex(DocumentIndex):
document_ids=doc_ids, index_name=index_name, http_client=http_client
)
def delete_single(self, doc_id: str) -> None:
    """Hard delete one document from the primary (and, if set, secondary) index.

    Sends a selection-based DELETE per index and keeps re-issuing it with the
    returned continuation token until the response carries none. Possibly
    faster overall than the `delete` method because a single selection query
    is used instead of separate calls.

    Parameters:
        - doc_id: document id as specified by the connector; it is passed
          through `replace_invalid_doc_id_characters` before being used

    Raises:
        - httpx.HTTPStatusError: logged and re-raised when a delete request
          returns an error status
    """
    # The selection-query form of deletion follows the example at
    # https://docs.vespa.ai/en/operations/batch-delete.html#example
    doc_id = replace_invalid_doc_id_characters(doc_id)

    targets = [self.index_name]
    if self.secondary_index_name:
        targets.append(self.secondary_index_name)

    # NOTE: `httpx` is used because `requests` doesn't support HTTP2, which helps
    # when a large volume of indexing / update / delete requests is made.
    with httpx.Client(http2=True) as client:
        for index_name in targets:
            endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
            query = httpx.QueryParams(
                {
                    "selection": f"{index_name}.document_id=='{doc_id}'",
                    "cluster": DOCUMENT_INDEX_NAME,
                }
            )

            total_chunks_deleted = 0
            while True:
                try:
                    response = client.delete(endpoint, params=query)
                    response.raise_for_status()
                except httpx.HTTPStatusError as e:
                    logger.error(
                        f"Failed to delete chunk, details: {e.response.text}"
                    )
                    raise

                payload = response.json()
                if "documentCount" in payload:
                    total_chunks_deleted += payload["documentCount"]

                # A missing or empty continuation token means there are no more pages
                next_token = (
                    payload["continuation"] if "continuation" in payload else None
                )
                if not next_token:
                    break
                query = query.set("continuation", next_token)

            logger.debug(
                f"VespaIndex.delete_single: "
                f"index={index_name} "
                f"doc={doc_id} "
                f"chunks_deleted={total_chunks_deleted}"
            )
def id_based_retrieval(
self,
chunk_requests: list[VespaChunkRequest],

View File

@@ -4,7 +4,7 @@ asyncpg==0.27.0
atlassian-python-api==3.37.0
beautifulsoup4==4.12.2
boto3==1.34.84
celery==5.3.4
celery==5.5.0b4
chardet==5.2.0
dask==2023.8.1
ddtrace==2.6.5