trivy workaround

Merge pull request #2676 from danswer-ai/hotfix/v0.7-vespa-delete-performance
hotfix for vespa delete performance
2026-02-25 11:45:47 +00:00 · 2024-10-03 15:25:54 -07:00 · 2024-10-03 10:59:32 -07:00 · 2024-10-03 10:59:14 -07:00 · 2024-10-03 10:21:41 -07:00 · 2024-10-03 10:21:23 -07:00
10 changed files with 106 additions and 5 deletions
--- a/.github/workflows/docker-build-push-backend-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-backend-container-on-tag.yml
@@ -46,8 +46,16 @@ jobs:
        build-args: |
          DANSWER_VERSION=${{ github.ref_name }}

+    # trivy has its own rate limiting issues that cause this action to flake
+    # we work around it by hardcoding alternate db repos in env below
+    # can go back to the default db repos when they figure it out
+    # https://github.com/aquasecurity/trivy/discussions/7538
+    # https://github.com/aquasecurity/trivy-action/issues/389
    - name: Run Trivy vulnerability scanner
      uses: aquasecurity/trivy-action@master
+      env:
+        TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
+        TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
      with:
        # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend
        image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
--- a/.github/workflows/docker-build-push-model-server-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-model-server-container-on-tag.yml
@@ -40,8 +40,16 @@ jobs:
        build-args: |
          DANSWER_VERSION=${{ github.ref_name }}

+    # trivy has its own rate limiting issues that cause this action to flake
+    # we work around it by hardcoding alternate db repos in env below
+    # can go back to the default db repos when they figure it out
+    # https://github.com/aquasecurity/trivy/discussions/7538
+    # https://github.com/aquasecurity/trivy-action/issues/389
    - name: Run Trivy vulnerability scanner
      uses: aquasecurity/trivy-action@master
+      env:
+        TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
+        TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
      with:
        image-ref: docker.io/danswer/danswer-model-server:${{ github.ref_name }}
        severity: 'CRITICAL,HIGH'
--- a/.github/workflows/docker-build-push-web-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-web-container-on-tag.yml
@@ -113,8 +113,16 @@ jobs:
        run: |
          docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}

+    # trivy has its own rate limiting issues that cause this action to flake
+    # we work around it by hardcoding alternate db repos in env below
+    # can go back to the default db repos when they figure it out
+    # https://github.com/aquasecurity/trivy/discussions/7538
+    # https://github.com/aquasecurity/trivy-action/issues/389
      - name: Run Trivy vulnerability scanner
        uses: aquasecurity/trivy-action@master
+        env:
+          TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
+          TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
        with:
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
          severity: 'CRITICAL,HIGH'
--- a/.github/workflows/pr-python-checks.yml
+++ b/.github/workflows/pr-python-checks.yml
@@ -3,7 +3,9 @@ name: Python Checks
 on:
  merge_group:
  pull_request:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release/**'

 jobs:
  mypy-check:
--- a/.github/workflows/pr-python-tests.yml
+++ b/.github/workflows/pr-python-tests.yml
@@ -3,7 +3,9 @@ name: Python Unit Tests
 on:
  merge_group:
  pull_request:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release/**'

 jobs:
  backend-check:
--- a/.github/workflows/run-it.yml
+++ b/.github/workflows/run-it.yml
@@ -6,7 +6,9 @@ concurrency:
 on:
  merge_group:
  pull_request:
-    branches: [ main ]
+    branches:
+      - main
+      - 'release/**'

 env:
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
--- a/backend/danswer/background/connector_deletion.py
+++ b/backend/danswer/background/connector_deletion.py
@@ -148,7 +148,7 @@ def document_by_cc_pair_cleanup_task(
            if count == 1:
                # count == 1 means this is the only remaining cc_pair reference to the doc
                # delete it from vespa and the db
-                document_index.delete(doc_ids=[document_id])
+                document_index.delete_single(doc_id=document_id)
                delete_documents_complete__no_commit(
                    db_session=db_session,
                    document_ids=[document_id],
--- a/backend/danswer/document_index/interfaces.py
+++ b/backend/danswer/document_index/interfaces.py
@@ -156,6 +156,16 @@ class Deletable(abc.ABC):
    Class must implement the ability to delete document by their unique document ids.
    """

+    @abc.abstractmethod
+    def delete_single(self, doc_id: str) -> None:
+        """
+        Hard delete the document with the given id from the document index.
+
+        Parameters:
+        - doc_id: document id as specified by the connector
+        """
+        raise NotImplementedError
+
    @abc.abstractmethod
    def delete(self, doc_ids: list[str]) -> None:
        """
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -13,6 +13,7 @@ from typing import cast
 import httpx
 import requests

+from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
 from danswer.configs.chat_configs import DOC_TIME_DECAY
 from danswer.configs.chat_configs import NUM_RETURNED_HITS
 from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -479,6 +480,66 @@ class VespaIndex(DocumentIndex):
                    document_ids=doc_ids, index_name=index_name, http_client=http_client
                )

+    def delete_single(self, doc_id: str) -> None:
+        """Delete everything matching doc_id from the primary (and any secondary) index
+        with a selection-query delete; possibly faster overall than the delete method."""
+
+        # Deleting by selection query follows the example in Vespa's batch-delete docs:
+        # https://docs.vespa.ai/en/operations/batch-delete.html#example
+
+        doc_id = replace_invalid_doc_id_characters(doc_id)
+
+        # NOTE: using `httpx` here since `requests` doesn't support HTTP/2. This is beneficial for
+        # indexing / updates / deletes since we have to make a large volume of requests.
+        index_names = [self.index_name]
+        if self.secondary_index_name:
+            index_names.append(self.secondary_index_name)
+
+        with httpx.Client(http2=True) as http_client:
+            for index_name in index_names:
+                params = httpx.QueryParams(
+                    {
+                        "selection": f"{index_name}.document_id=='{doc_id}'",
+                        "cluster": DOCUMENT_INDEX_NAME,
+                    }
+                )
+
+                total_chunks_deleted = 0
+                while True:
+                    try:
+                        resp = http_client.delete(
+                            f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}",
+                            params=params,
+                        )
+                        resp.raise_for_status()
+                    except httpx.HTTPStatusError as e:
+                        logger.error(
+                            f"Failed to delete chunk, details: {e.response.text}"
+                        )
+                        raise
+
+                    resp_data = resp.json()
+
+                    if "documentCount" in resp_data:
+                        chunks_deleted = resp_data["documentCount"]
+                        total_chunks_deleted += chunks_deleted
+
+                    # Repeat the request with the continuation token until none is returned
+                    if "continuation" not in resp_data:
+                        break  # Exit loop if no continuation token
+
+                    if not resp_data["continuation"]:
+                        break  # Exit loop if continuation token is empty
+
+                    params = params.set("continuation", resp_data["continuation"])
+
+                logger.debug(
+                    f"VespaIndex.delete_single: "
+                    f"index={index_name} "
+                    f"doc={doc_id} "
+                    f"chunks_deleted={total_chunks_deleted}"
+                )
+
    def id_based_retrieval(
        self,
        chunk_requests: list[VespaChunkRequest],
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -4,7 +4,7 @@ asyncpg==0.27.0
 atlassian-python-api==3.37.0
 beautifulsoup4==4.12.2
 boto3==1.34.84
-celery==5.3.4
+celery==5.5.0b4
 chardet==5.2.0
 dask==2023.8.1
 ddtrace==2.6.5
Author	SHA1	Message	Date
Richard Kuo (Danswer)	1966127bd4	trivy workaround	2024-10-03 15:25:54 -07:00
rkuo-danswer	3ac84da698	Merge pull request #2676 from danswer-ai/hotfix/v0.7-vespa-delete-performance hotfix for vespa delete performance	2024-10-03 10:59:32 -07:00
rkuo-danswer	7c7f5b37f5	Merge pull request #2675 from danswer-ai/hotfix/v0.7-bump-celery bump celery	2024-10-03 10:59:14 -07:00
Richard Kuo (Danswer)	0bf9243891	Merge branch 'release/v0.7' of github.com:danswer-ai/danswer into hotfix/v0.7-bump-celery	2024-10-03 10:21:41 -07:00
Richard Kuo (Danswer)	cfe4bbe3c7	Merge branch 'release/v0.7' of github.com:danswer-ai/danswer into hotfix/v0.7-vespa-delete-performance	2024-10-03 10:21:23 -07:00
Richard Kuo (Danswer)	9d18b92b90	fix sync checks	2024-10-03 10:20:57 -07:00
Richard Kuo (Danswer)	74315e21b3	bump celery	2024-10-03 09:44:25 -07:00
Richard Kuo (Danswer)	f9a5b227a1	hotfix for vespa delete performance	2024-10-03 09:43:02 -07:00