Compare commits

...

2 Commits

Author SHA1 Message Date
Weves
327ababeee More logging 2025-09-26 10:54:04 -07:00
Weves
18f13bd2ed Fix pre-commit to exclude .venv directory from lazy import check 2025-09-25 12:59:49 -07:00
3 changed files with 27 additions and 4 deletions

View File

@@ -43,7 +43,7 @@ repos:
name: Check lazy imports are not directly imported
entry: python3 backend/scripts/check_lazy_imports.py
language: system
files: ^backend/.*\.py$
files: ^backend/(?!\.venv/).*\.py$
pass_filenames: false
# We would like to have a mypy pre-commit hook, but due to the fact that

View File

@@ -57,6 +57,8 @@ from onyx.connectors.sharepoint.connector_utils import get_sharepoint_external_a
from onyx.file_processing.extract_file_text import ACCEPTED_IMAGE_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import extract_text_and_images
from onyx.file_processing.extract_file_text import get_file_ext
from onyx.file_processing.extract_file_text import is_accepted_file_ext
from onyx.file_processing.extract_file_text import OnyxExtensionType
from onyx.file_processing.file_validation import EXCLUDED_IMAGE_TYPES
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.utils.b64 import get_image_type_from_bytes
@@ -758,7 +760,7 @@ class SharepointConnector(
try:
site = self.graph_client.sites.get_by_url(site_descriptor.url)
drives = site.drives.get().execute_query()
logger.debug(f"Found drives: {[drive.name for drive in drives]}")
logger.info(f"Found drives: {[drive.name for drive in drives]}")
drives = [
drive
@@ -770,19 +772,23 @@ class SharepointConnector(
if drive is None:
logger.warning(f"Drive '{drive_name}' not found")
return []
logger.info(f"Found drive: {drive.name}")
try:
root_folder = drive.root
if site_descriptor.folder_path:
for folder_part in site_descriptor.folder_path.split("/"):
root_folder = root_folder.get_by_path(folder_part)
logger.info(f"Found root folder: {root_folder.name}")
# TODO: consider ways to avoid materializing the entire list of files in memory
query = root_folder.get_files(
recursive=True,
page_size=1000,
)
driveitems = query.execute_query()
logger.debug(f"Found {len(driveitems)} items in drive '{drive_name}'")
logger.info(f"Found {len(driveitems)} items in drive '{drive_name}'")
# Filter items based on folder path if specified
if site_descriptor.folder_path:
@@ -821,7 +827,7 @@ class SharepointConnector(
<= item.last_modified_datetime.replace(tzinfo=timezone.utc)
<= end
]
logger.debug(
logger.info(
f"Found {len(driveitems)} items within time window in drive '{drive.name}'"
)
@@ -1408,6 +1414,9 @@ class SharepointConnector(
return checkpoint
try:
logger.info(
f"Fetching drive items for drive name: {current_drive_name}"
)
driveitems = self._get_drive_items_for_drive_name(
site_descriptor, current_drive_name, start_dt, end_dt
)
@@ -1441,6 +1450,12 @@ class SharepointConnector(
)
for driveitem in driveitems:
driveitem_extension = get_file_ext(driveitem.name)
if not is_accepted_file_ext(driveitem_extension, OnyxExtensionType.All):
logger.warning(
f"Skipping {driveitem.web_url} as it is not a supported file type"
)
continue
# Only yield empty documents if they are PDFs or images
should_yield_if_empty = (
driveitem_extension in ACCEPTED_IMAGE_FILE_EXTENSIONS
@@ -1464,6 +1479,10 @@ class SharepointConnector(
TextSection(link=driveitem.web_url, text="")
]
yield doc
else:
logger.warning(
f"Skipping {driveitem.web_url} as it is empty and not a PDF or image"
)
except Exception as e:
logger.warning(
f"Failed to process driveitem {driveitem.web_url}: {e}"

View File

@@ -104,6 +104,10 @@ def find_python_files(
if ignore_directories is None:
ignore_directories = set()
# Always ignore virtual environment directories
venv_dirs = {".venv", "venv", ".env", "env", "__pycache__"}
ignore_directories = ignore_directories.union(venv_dirs)
python_files = []
for file_path in backend_dir.glob("**/*.py"):
# Skip test files (they can contain test imports)