More logging

Fix pre-commit to exclude .venv directory from lazy import check
2026-02-21 17:55:45 +00:00 · 2025-09-26 10:54:04 -07:00 · 2025-09-25 12:59:49 -07:00
3 changed files with 27 additions and 4 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -43,7 +43,7 @@ repos:
        name: Check lazy imports are not directly imported
        entry: python3 backend/scripts/check_lazy_imports.py
        language: system
-        files: ^backend/.*\.py$
+        files: ^backend/(?!\.venv/).*\.py$
        pass_filenames: false

  # We would like to have a mypy pre-commit hook, but due to the fact that
--- a/backend/onyx/connectors/sharepoint/connector.py
+++ b/backend/onyx/connectors/sharepoint/connector.py
@@ -57,6 +57,8 @@ from onyx.connectors.sharepoint.connector_utils import get_sharepoint_external_a
 from onyx.file_processing.extract_file_text import ACCEPTED_IMAGE_FILE_EXTENSIONS
 from onyx.file_processing.extract_file_text import extract_text_and_images
 from onyx.file_processing.extract_file_text import get_file_ext
+from onyx.file_processing.extract_file_text import is_accepted_file_ext
+from onyx.file_processing.extract_file_text import OnyxExtensionType
 from onyx.file_processing.file_validation import EXCLUDED_IMAGE_TYPES
 from onyx.file_processing.image_utils import store_image_and_create_section
 from onyx.utils.b64 import get_image_type_from_bytes
@@ -758,7 +760,7 @@ class SharepointConnector(
        try:
            site = self.graph_client.sites.get_by_url(site_descriptor.url)
            drives = site.drives.get().execute_query()
-            logger.debug(f"Found drives: {[drive.name for drive in drives]}")
+            logger.info(f"Found drives: {[drive.name for drive in drives]}")

            drives = [
                drive
@@ -770,19 +772,23 @@ class SharepointConnector(
            if drive is None:
                logger.warning(f"Drive '{drive_name}' not found")
                return []
+
+            logger.info(f"Found drive: {drive.name}")
            try:
                root_folder = drive.root
                if site_descriptor.folder_path:
                    for folder_part in site_descriptor.folder_path.split("/"):
                        root_folder = root_folder.get_by_path(folder_part)

+                logger.info(f"Found root folder: {root_folder.name}")
+
                # TODO: consider ways to avoid materializing the entire list of files in memory
                query = root_folder.get_files(
                    recursive=True,
                    page_size=1000,
                )
                driveitems = query.execute_query()
-                logger.debug(f"Found {len(driveitems)} items in drive '{drive_name}'")
+                logger.info(f"Found {len(driveitems)} items in drive '{drive_name}'")

                # Filter items based on folder path if specified
                if site_descriptor.folder_path:
@@ -821,7 +827,7 @@ class SharepointConnector(
                        <= item.last_modified_datetime.replace(tzinfo=timezone.utc)
                        <= end
                    ]
-                    logger.debug(
+                    logger.info(
                        f"Found {len(driveitems)} items within time window in drive '{drive.name}'"
                    )

@@ -1408,6 +1414,9 @@ class SharepointConnector(
                return checkpoint

            try:
+                logger.info(
+                    f"Fetching drive items for drive name: {current_drive_name}"
+                )
                driveitems = self._get_drive_items_for_drive_name(
                    site_descriptor, current_drive_name, start_dt, end_dt
                )
@@ -1441,6 +1450,12 @@ class SharepointConnector(
            )
            for driveitem in driveitems:
                driveitem_extension = get_file_ext(driveitem.name)
+                if not is_accepted_file_ext(driveitem_extension, OnyxExtensionType.All):
+                    logger.warning(
+                        f"Skipping {driveitem.web_url} as it is not a supported file type"
+                    )
+                    continue
+
                # Only yield empty documents if they are PDFs or images
                should_yield_if_empty = (
                    driveitem_extension in ACCEPTED_IMAGE_FILE_EXTENSIONS
@@ -1464,6 +1479,10 @@ class SharepointConnector(
                                TextSection(link=driveitem.web_url, text="")
                            ]
                            yield doc
+                        else:
+                            logger.warning(
+                                f"Skipping {driveitem.web_url} as it is empty and not a PDF or image"
+                            )
                except Exception as e:
                    logger.warning(
                        f"Failed to process driveitem {driveitem.web_url}: {e}"
--- a/backend/scripts/check_lazy_imports.py
+++ b/backend/scripts/check_lazy_imports.py
@@ -104,6 +104,10 @@ def find_python_files(
    if ignore_directories is None:
        ignore_directories = set()

+    # Always ignore virtual environment directories
+    venv_dirs = {".venv", "venv", ".env", "env", "__pycache__"}
+    ignore_directories = ignore_directories.union(venv_dirs)
+
    python_files = []
    for file_path in backend_dir.glob("**/*.py"):
        # Skip test files (they can contain test imports)
Author	SHA1	Message	Date
Weves	327ababeee	More logging	2025-09-26 10:54:04 -07:00
Weves	18f13bd2ed	Fix pre-commit to exclude .venv directory from lazy import check	2025-09-25 12:59:49 -07:00