Compare commits

...

49 Commits

Author SHA1 Message Date
rexjohannes
25e3483a1e Fix formatting in README.md 2025-12-19 23:34:14 +01:00
rexjohannes
948c567a84 Update docs URL for drupal_wiki source 2025-12-19 23:31:27 +01:00
rexjohannes
436bbe4870 Merge remote-tracking branch 'origin/feat/drupal-wiki-connector' into feat/drupal-wiki-connector 2025-12-19 21:50:34 +01:00
rexjohannes
8c9e5641a7 feat(connector): update type hints to use built-in list and dict syntax 2025-12-19 21:50:26 +01:00
rexjohannes
70b29e2334 Merge branch 'main' into feat/drupal-wiki-connector 2025-12-19 21:43:40 +01:00
rexjohannes
36f67c0bab remove: delete drupal-wiki connector tests and fixture 2025-12-19 21:43:15 +01:00
rexjohannes
acbbc4f62c Merge remote-tracking branch 'origin/feat/drupal-wiki-connector' into feat/drupal-wiki-connector 2025-12-19 21:28:45 +01:00
rexjohannes
8f199db845 feat(connector): initialize None values for headers, api_token, and checkpoint attributes 2025-12-19 21:28:28 +01:00
rexjohannes
228543fb52 Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-12-19 20:57:08 +01:00
rexjohannes
17b7b50301 feat(connector): enhance Drupal Wiki connector with improved space handling and updated models 2025-12-19 20:53:31 +01:00
rexjohannes
7755daa06d Merge branch 'main' into feat/drupal-wiki-connector 2025-12-19 19:22:16 +01:00
rexjohannes
1a4cffd029 Merge branch 'main' into feat/drupal-wiki-connector 2025-10-12 19:40:42 +02:00
rexjohannes
239e127bf6 Forcing an empty commit. 2025-10-09 14:49:33 +00:00
rexjohannes
febaa55dd3 refactor(tests): format patching method for rate_limited_get in test cases 2025-10-09 14:38:48 +00:00
rexjohannes
5dbf60342d refactor(tests): update patching method for rate_limited_get in test cases 2025-10-09 14:32:25 +00:00
rexjohannes
13c658e158 refactor(connector.py): move import of override from typing_extensions 2025-10-09 14:12:39 +00:00
rexjohannes
200bcf8ba6 refactor(Drupal Wiki Connector): Clean up import statements and improve logging format 2025-10-09 14:04:49 +00:00
rexjohannes
a589c5b78a Merge branch 'main' into feat/drupal-wiki-connector 2025-10-09 15:47:54 +02:00
rexjohannes
0e73642b52 refactor(Drupal Wiki): Remove unused HTML text extraction function and optimize page fetching logic 2025-10-09 13:32:36 +00:00
rexjohannes
0307eec14c feat(Drupal Wiki): Add attachment size threshold and improve file type validation 2025-10-08 16:13:56 +00:00
rexjohannes
b6a1e22adb Merge branch 'main' into feat/drupal-wiki-connector 2025-10-08 16:53:50 +02:00
rexjohannes
e0ca0b32b1 feat: add Drupal Wiki as a document source and update test to use new method name 2025-10-06 22:58:11 +00:00
rexjohannes
4d51a3cd32 fix: update Drupal Wiki base URL description and add connector mapping 2025-10-06 22:29:41 +00:00
rexjohannes
d931ca3e99 refactor: rename retrieve_all_slim_documents to retrieve_all_slim_docs and add backward compatibility wrapper 2025-10-06 22:09:09 +00:00
rexjohannes
0d61b02736 fix: remove unused connector imports from factory.py 2025-10-06 21:55:00 +00:00
rexjohannes
b215e69200 Merge branch 'main' into feat/drupal-wiki-connector 2025-10-06 21:47:46 +00:00
rexjohannes
5cf3d273ff feat: streamline image storage process by removing unnecessary session context 2025-07-30 16:17:23 +02:00
rexjohannes
00717ba1ad Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-07-30 14:55:11 +02:00
rexjohannes
b5042c3856 Merge branch 'main' into feat/drupal-wiki-connector 2025-07-24 23:32:09 +02:00
rexjohannes
3a89549b6a feat: update attachment processing to use direct download URLs 2025-07-12 01:56:20 +02:00
rexjohannes
5ee3b2538e Merge branch 'main' into feat/drupal-wiki-connector 2025-07-12 01:00:15 +02:00
rexjohannes
0975b3878b refactor: update import path and correct parameter names in image processing 2025-07-10 17:19:53 +02:00
rexjohannes
fef2acb45c Merge branch 'main' into feat/drupal-wiki-connector 2025-07-10 13:33:41 +02:00
rexjohannes
b9bc5356f9 Merge branch 'main' into feat/drupal-wiki-connector 2025-07-07 23:59:12 +02:00
rexjohannes
e872956b8f Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-06-20 22:46:05 +02:00
rexjohannes
6afba2787f remove perm_sync_data from SlimDocument in DrupalWikiConnector 2025-06-10 09:58:57 +00:00
rexjohannes
c52fac3683 Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-06-10 11:30:59 +02:00
rexjohannes
dcd0526832 Fix formatting in Drupal wiki test and apply prettier formatting to web components
- Fixed indentation and formatting in drupal wiki test
- Applied prettier formatting across web components
- Ensured pre-commit hooks pass correctly
2025-06-07 21:17:54 +00:00
rexjohannes
6a16586a5a Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-06-07 22:40:54 +02:00
rexjohannes
313af6474b fix: improve formatting of mock API response in test for Drupal Wiki connector 2025-06-06 21:58:36 +00:00
rexjohannes
c82335907a fix mypy-check
fix https://github.com/onyx-dot-app/onyx/actions/runs/15486641039/job/43602716159?pr=4773
2025-06-06 12:42:43 +02:00
rexjohannes
109cc2aa1b fix: merge conflicts 2025-06-06 08:49:27 +00:00
rexjohannes
de99704dfa Update backend/onyx/connectors/drupal_wiki/connector.py
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-05-26 21:43:56 +02:00
rexjohannes
c0305a9e47 fix: Refactor code for improved readability and consistency in various components 2025-05-26 20:33:47 +02:00
rexjohannes
95aed16535 fix: Update tab handling to use current form value and improve state management 2025-05-26 00:15:20 +02:00
rexjohannes
2745d28d8a fix: Ensure safe access to tab values and handle empty tab lists 2025-05-25 23:46:56 +02:00
rexjohannes
3dbdf33bbd Merge branch 'main' into feat/drupal-wiki-connector 2025-05-25 23:05:58 +02:00
rexjohannes
83c3163597 feat: Implement Drupal Wiki connector with configuration and document processing 2025-05-25 22:59:57 +02:00
rexjohannes
1c37dff80c feat: Add DrupalWiki connector 2025-05-25 22:55:15 +02:00
17 changed files with 1177 additions and 4 deletions

0
backend/__init__.py Normal file
View File

View File

@@ -541,6 +541,11 @@ GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD = int(
os.environ.get("GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD", 10 * 1024 * 1024)
)
# Maximum size (in bytes) for Drupal Wiki attachments; larger files are skipped
# by the connector. Overridable via env var; default is 10MB.
DRUPAL_WIKI_ATTACHMENT_SIZE_THRESHOLD = int(
    os.environ.get("DRUPAL_WIKI_ATTACHMENT_SIZE_THRESHOLD", 10 * 1024 * 1024)
)
# Default size threshold for SharePoint files (20MB)
SHAREPOINT_CONNECTOR_SIZE_THRESHOLD = int(
os.environ.get("SHAREPOINT_CONNECTOR_SIZE_THRESHOLD", 20 * 1024 * 1024)

View File

@@ -209,6 +209,7 @@ class DocumentSource(str, Enum):
EGNYTE = "egnyte"
AIRTABLE = "airtable"
HIGHSPOT = "highspot"
DRUPAL_WIKI = "drupal_wiki"
IMAP = "imap"
BITBUCKET = "bitbucket"
@@ -628,6 +629,7 @@ project management, and collaboration tools into a single, customizable platform
DocumentSource.EGNYTE: "egnyte - files",
DocumentSource.AIRTABLE: "airtable - database",
DocumentSource.HIGHSPOT: "highspot - CRM data",
DocumentSource.DRUPAL_WIKI: "drupal wiki - knowledge base content (pages, spaces, attachments)",
DocumentSource.IMAP: "imap - email data",
DocumentSource.TESTRAIL: "testrail - test case management tool for QA processes",
}

View File

@@ -71,6 +71,13 @@ def time_str_to_utc(datetime_str: str) -> datetime:
raise ValueError(f"Unable to parse datetime string: {datetime_str}")
# TODO: use this function in other connectors
def datetime_from_utc_timestamp(timestamp: int) -> datetime:
    """Interpret ``timestamp`` as seconds since the Unix epoch and return a timezone-aware UTC datetime."""
    return datetime.fromtimestamp(timestamp, timezone.utc)
def basic_expert_info_representation(info: BasicExpertInfo) -> str | None:
if info.first_name and info.last_name:
return f"{info.first_name} {info.middle_initial} {info.last_name}"

View File

@@ -0,0 +1,936 @@
import mimetypes
from io import BytesIO
from pathlib import Path
from typing import Any
import requests
from typing_extensions import override
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.app_configs import DRUPAL_WIKI_ATTACHMENT_SIZE_THRESHOLD
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
datetime_from_utc_timestamp,
)
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import rate_limit_builder
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import rl_requests
from onyx.connectors.drupal_wiki.models import DrupalWikiCheckpoint
from onyx.connectors.drupal_wiki.models import DrupalWikiPage
from onyx.connectors.drupal_wiki.models import DrupalWikiPageResponse
from onyx.connectors.drupal_wiki.models import DrupalWikiSpaceResponse
from onyx.connectors.drupal_wiki.utils import build_drupal_wiki_document_id
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.interfaces import CheckpointedConnector
from onyx.connectors.interfaces import CheckpointOutput
from onyx.connectors.interfaces import ConnectorFailure
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_text_and_images
from onyx.file_processing.file_types import OnyxFileExtensions
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.b64 import get_image_type_from_bytes
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
logger = setup_logger()
# Largest page size the Drupal Wiki REST API accepts per request.
MAX_API_PAGE_SIZE = 2000  # max allowed by API
# Query-parameter name used to filter the page listing by space id.
DRUPAL_WIKI_SPACE_KEY = "space"
# File extensions the connector will attempt to index as attachments.
SUPPORTED_ATTACHMENT_EXTENSIONS = OnyxFileExtensions.ALL_ALLOWED_EXTENSIONS
# HTTP GET wrapped with retries and rate limiting (at most 10 calls per 1s period).
rate_limited_get = retry_builder()(
    rate_limit_builder(max_calls=10, period=1)(rl_requests.get)
)
class DrupalWikiConnector(
    CheckpointedConnector[DrupalWikiCheckpoint],
    SlimConnector,
):
    """Connector that indexes pages and attachments from a Drupal Wiki instance.

    Implements both checkpointed (resumable) indexing and slim-document retrieval.
    """

    def __init__(
        self,
        base_url: str,
        spaces: list[str] | None = None,
        pages: list[str] | None = None,
        include_all_spaces: bool = False,
        batch_size: int = INDEX_BATCH_SIZE,
        continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
        drupal_wiki_scope: str | None = None,
        include_attachments: bool = False,
        allow_images: bool = False,
    ) -> None:
        """
        Initialize the Drupal Wiki connector.

        Args:
            base_url: The base URL of the Drupal Wiki instance (e.g., https://help.drupal-wiki.com)
            spaces: List of space IDs to index. If None and include_all_spaces is False, no spaces will be indexed.
            pages: List of page IDs to index. If provided, only these specific pages will be indexed.
            include_all_spaces: If True, all spaces will be indexed regardless of the spaces parameter.
            batch_size: Number of documents to process in a batch.
            continue_on_failure: If True, continue indexing even if some documents fail.
            drupal_wiki_scope: The selected tab value from the frontend. If "all_spaces", all spaces will be indexed.
            include_attachments: If True, enable processing of page attachments including images and documents.
            allow_images: If True, enable processing of image attachments.
        """
        self.base_url = base_url.rstrip("/")  # normalize away any trailing slash
        self.spaces = spaces or []
        self.pages = pages or []
        # Determine whether to include all spaces based on the selected tab
        # If drupal_wiki_scope is "all_spaces", we should index all spaces
        # If it's "specific_spaces", we should only index the specified spaces
        # If it's None, we use the include_all_spaces parameter
        if drupal_wiki_scope == "all_spaces":
            logger.info("drupal_wiki_scope is 'all_spaces', will index all spaces")
            self.include_all_spaces = True
        elif drupal_wiki_scope == "specific_spaces":
            logger.info(
                "drupal_wiki_scope is 'specific_spaces', will only index specified spaces"
            )
            self.include_all_spaces = False
        else:
            logger.info(
                f"drupal_wiki_scope not set, using include_all_spaces={include_all_spaces}"
            )
            self.include_all_spaces = include_all_spaces
        self.batch_size = batch_size
        self.continue_on_failure = continue_on_failure
        # Attachment processing configuration
        self.include_attachments = include_attachments
        self.allow_images = allow_images
        # Will be set by load_credentials
        self.headers: dict[str, str] | None = None
        self._api_token: str | None = None
def set_allow_images(self, value: bool) -> None:
    """Toggle processing of image attachments after construction."""
    logger.info(f"Setting allow_images to {value}.")
    self.allow_images = value
def _get_page_attachments(self, page_id: int) -> list[dict[str, Any]]:
    """
    Fetch the attachment listing for one page.

    Args:
        page_id: ID of the page.

    Returns:
        A list of attachment dicts as returned by the API. Any failure is
        logged and results in an empty list — never an exception.
    """
    endpoint = f"{self.base_url}/api/rest/scope/api/attachment"
    logger.info(f"Fetching attachments for page {page_id} from {endpoint}")
    try:
        resp = rate_limited_get(
            endpoint, headers=self.headers, params={"pageId": str(page_id)}
        )
        resp.raise_for_status()
        result = resp.json()
        logger.info(f"Found {len(result)} attachments for page {page_id}")
        return result
    except Exception as e:
        logger.warning(f"Failed to fetch attachments for page {page_id}: {e}")
        return []
def _download_attachment(self, attachment_id: int) -> bytes:
    """
    Download the raw content of an attachment.

    Args:
        attachment_id: ID of the attachment to download.

    Returns:
        Raw bytes of the attachment.

    Raises:
        ValueError: if load_credentials has not populated self.headers yet.
        Exception: whatever raise_for_status raises on an HTTP error.
    """
    endpoint = f"{self.base_url}/api/rest/scope/api/attachment/{attachment_id}/download"
    logger.info(f"Downloading attachment {attachment_id} from {endpoint}")
    if self.headers is None:
        raise ValueError(
            "Headers not set in connector instance"
        )  # because mypy complains
    # Binary download: send only Authorization, without the JSON Accept header.
    auth_only_headers = {"Authorization": self.headers["Authorization"]}
    resp = rate_limited_get(endpoint, headers=auth_only_headers)
    resp.raise_for_status()
    return resp.content
def _validate_attachment_filetype(self, attachment: dict[str, Any]) -> bool:
    """
    Check whether an attachment's file extension is one we can index.

    Args:
        attachment: Attachment dictionary from Drupal Wiki API.

    Returns:
        True when the extension is supported; False otherwise, including
        attachments that carry no "fileName" at all.
    """
    file_name = attachment.get("fileName", "")
    if not file_name:
        return False
    extension = Path(file_name).suffix.lower()
    if extension not in SUPPORTED_ATTACHMENT_EXTENSIONS:
        logger.info(f"Unsupported file type: {extension} for {file_name}")
        return False
    return True
def _get_media_type_from_filename(self, filename: str) -> str:
    """
    Best-effort MIME type lookup using the stdlib mimetypes table.

    Args:
        filename: The filename.

    Returns:
        The guessed media type, or "application/octet-stream" when the
        extension is unknown.
    """
    guessed, _ = mimetypes.guess_type(filename)
    return guessed if guessed else "application/octet-stream"
def _process_attachment(
    self,
    attachment: dict[str, Any],
    page_id: int,
    download_url: str,
) -> tuple[list[TextSection | ImageSection], str | None]:
    """
    Process a single attachment and return generated sections.

    Flow: validate extension -> size check -> download -> either store the
    whole file as an image section (image media types) or extract text plus
    any embedded images (all other types).

    Args:
        attachment: Attachment dictionary from Drupal Wiki API.
        page_id: ID of the parent page (used only for logging here).
        download_url: Direct download URL for the attachment.

    Returns:
        Tuple of (sections, error_message). If error_message is not None, the
        sections list should be treated as invalid.
    """
    sections: list[TextSection | ImageSection] = []
    try:
        if not self._validate_attachment_filetype(attachment):
            return (
                [],
                f"Unsupported file type: {attachment.get('fileName', 'unknown')}",
            )
        attachment_id = attachment["id"]
        file_name = attachment.get("fileName", f"attachment_{attachment_id}")
        # NOTE(review): fileSize defaults to 0, so an attachment missing the
        # field always passes the size check below — confirm that is intended.
        file_size = attachment.get("fileSize", 0)
        media_type = self._get_media_type_from_filename(file_name)
        if file_size > DRUPAL_WIKI_ATTACHMENT_SIZE_THRESHOLD:
            return [], f"Attachment too large: {file_size} bytes"
        try:
            raw_bytes = self._download_attachment(attachment_id)
        except Exception as e:
            return [], f"Failed to download attachment: {e}"
        # Image attachments: store directly as a single ImageSection (no text
        # extraction), or skip entirely when images are disabled.
        if media_type.startswith("image/"):
            if not self.allow_images:
                logger.info(
                    "Skipping image attachment %s because allow_images is False",
                    file_name,
                )
                return [], None
            try:
                image_section, _ = store_image_and_create_section(
                    image_data=raw_bytes,
                    file_id=str(attachment_id),
                    display_name=attachment.get(
                        "name", attachment.get("fileName", "Unknown")
                    ),
                    link=download_url,
                    media_type=media_type,
                    file_origin=FileOrigin.CONNECTOR,
                )
                sections.append(image_section)
                logger.info("Stored image attachment with file name: %s", file_name)
            except Exception as e:
                return [], f"Image storage failed: {e}"
            return sections, None
        # Non-image attachments: extract text, storing any embedded images via
        # the closure below. image_counter numbers the embedded images.
        image_counter = 0

        def _store_embedded_image(image_data: bytes, image_name: str) -> None:
            # Callback invoked by extract_text_and_images for each embedded
            # image; appends an ImageSection to the enclosing `sections` list.
            nonlocal image_counter
            if not self.allow_images:
                return
            media_for_image = self._get_media_type_from_filename(image_name)
            if media_for_image == "application/octet-stream":
                # Filename gave no type — try sniffing the bytes instead.
                try:
                    media_for_image = get_image_type_from_bytes(image_data)
                except ValueError:
                    logger.debug(
                        "Unable to determine media type for embedded image %s on attachment %s",
                        image_name,
                        file_name,
                    )
            image_counter += 1
            display_name = (
                image_name
                or f"{attachment.get('name', file_name)} - embedded image {image_counter}"
            )
            try:
                image_section, _ = store_image_and_create_section(
                    image_data=image_data,
                    file_id=f"{attachment_id}_embedded_{image_counter}",
                    display_name=display_name,
                    link=download_url,
                    media_type=media_for_image,
                    file_origin=FileOrigin.CONNECTOR,
                )
                sections.append(image_section)
            except Exception as err:
                # Embedded-image failures are logged but do not fail the attachment.
                logger.warning(
                    "Failed to store embedded image %s for attachment %s: %s",
                    image_name or image_counter,
                    file_name,
                    err,
                )

        extraction_result = extract_text_and_images(
            file=BytesIO(raw_bytes),
            file_name=file_name,
            content_type=media_type,
            image_callback=_store_embedded_image if self.allow_images else None,
        )
        text_content = extraction_result.text_content.strip()
        if text_content:
            # Text goes first so the document's primary section is textual.
            sections.insert(0, TextSection(text=text_content, link=download_url))
            logger.info(
                "Extracted %d characters from %s", len(text_content), file_name
            )
        elif not sections:
            # No text and no embedded images — nothing indexable.
            return [], f"No text extracted for {file_name}"
        return sections, None
    except Exception as e:
        logger.error(
            "Failed to process attachment %s on page %s: %s",
            attachment.get("name", "unknown"),
            page_id,
            e,
        )
        return [], f"Failed to process attachment: {e}"
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
    """
    Store the API token and build the default request headers.

    Args:
        credentials: Dictionary that must contain "drupal_wiki_api_token".

    Returns:
        Always None — no derived credentials are produced.

    Raises:
        ConnectorValidationError: if the token key is absent.
    """
    if "drupal_wiki_api_token" not in credentials:
        raise ConnectorValidationError(
            "API token is required for Drupal Wiki connector"
        )
    token = credentials["drupal_wiki_api_token"]
    self._api_token = token
    self.headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {token}",
    }
    return None
def _get_space_ids(self) -> list[int]:
    """
    Get all space IDs from the Drupal Wiki instance, paging through the API.

    Returns:
        List of space IDs (deduplicated). The list is sorted to be deterministic.
    """
    url = f"{self.base_url}/api/rest/scope/api/space"
    size = MAX_API_PAGE_SIZE
    page = 0
    all_space_ids: set[int] = set()
    has_more = True
    # Progress guard: besides has_more, the loop stops if a fetch adds no new
    # ids — protection against an API that keeps returning the same full page.
    last_num_ids = -1
    while has_more and len(all_space_ids) > last_num_ids:
        last_num_ids = len(all_space_ids)
        params = {"size": size, "page": page}
        logger.info(f"Fetching spaces from {url} (page={page}, size={size})")
        response = rate_limited_get(url, headers=self.headers, params=params)
        response.raise_for_status()
        resp_json = response.json()
        space_response = DrupalWikiSpaceResponse.model_validate(resp_json)
        logger.info(f"Fetched {len(space_response.content)} spaces (page={page})")
        # Collect ids into the set to deduplicate
        for space in space_response.content:
            all_space_ids.add(space.id)
        # Continue if we got a full page, indicating there might be more
        has_more = len(space_response.content) >= size
        page += 1
    # Return a deterministic, sorted list of ids
    space_id_list = list(sorted(all_space_ids))
    logger.info(f"Total spaces fetched: {len(space_id_list)}")
    return space_id_list
def _get_pages_for_space(
    self, space_id: int, modified_after: SecondsSinceUnixEpoch | None = None
) -> list[DrupalWikiPage]:
    """
    Get all pages for a specific space, optionally filtered by modification time.

    Args:
        space_id: ID of the space.
        modified_after: Only return pages modified after this timestamp (seconds since Unix epoch).

    Returns:
        List of DrupalWikiPage objects.

    Raises:
        ConnectorValidationError: if a response fails pydantic validation.
    """
    url = f"{self.base_url}/api/rest/scope/api/page"
    size = MAX_API_PAGE_SIZE
    page = 0
    all_pages = []
    has_more = True
    max_pages = 100  # Safety limit to prevent infinite loops
    while has_more and page < max_pages:
        params: dict[str, str | int] = {
            DRUPAL_WIKI_SPACE_KEY: str(space_id),
            "size": size,
            "page": page,
        }
        # Add modifiedAfter parameter if provided
        if modified_after is not None:
            params["modifiedAfter"] = int(modified_after)
        logger.info(
            f"Fetching pages for space {space_id} from {url} ({page=}, {size=}, {modified_after=})"
        )
        response = rate_limited_get(url, headers=self.headers, params=params)
        response.raise_for_status()
        resp_json = response.json()
        try:
            page_response = DrupalWikiPageResponse.model_validate(resp_json)
        except Exception as e:
            logger.error(f"Failed to validate Drupal Wiki page response: {e}")
            logger.debug(f"Response data: {resp_json}")
            raise ConnectorValidationError(f"Invalid API response format: {e}")
        logger.info(
            f"Fetched {len(page_response.content)} pages in space {space_id} (page={page})"
        )
        # Pydantic should automatically parse content items as DrupalWikiPage objects
        # If validation fails, it will raise an exception which we should catch
        all_pages.extend(page_response.content)
        # Continue if we got a full page, indicating there might be more
        has_more = len(page_response.content) >= size
        page += 1
        # Warn once as soon as the safety limit is reached (loop exits next check).
        if page >= max_pages:
            logger.warning(
                f"Reached maximum page limit ({max_pages}) while fetching pages for space {space_id}"
            )
    logger.info(f"Total pages fetched for space {space_id}: {len(all_pages)}")
    return all_pages
def _get_page_content(self, page_id: int) -> DrupalWikiPage:
    """
    Fetch a single page (including its body) by id.

    Args:
        page_id: ID of the page.

    Returns:
        The validated DrupalWikiPage model.
    """
    resp = rate_limited_get(
        f"{self.base_url}/api/rest/scope/api/page/{page_id}",
        headers=self.headers,
    )
    resp.raise_for_status()
    return DrupalWikiPage.model_validate(resp.json())
def _process_page(self, page: DrupalWikiPage) -> Document | ConnectorFailure:
    """
    Process a page and convert it to a Document.

    Args:
        page: DrupalWikiPage object.

    Returns:
        Document object, or ConnectorFailure if anything raised while building it.
    """
    try:
        # Extract text from HTML, handle None body
        text_content = parse_html_page_basic(page.body or "")
        # Ensure text_content is a string, not None
        if text_content is None:
            text_content = ""
        # Create document URL
        page_url = build_drupal_wiki_document_id(self.base_url, page.id)
        # Create sections with just the page content
        sections: list[TextSection | ImageSection] = [
            TextSection(text=text_content, link=page_url)
        ]
        # Only process attachments if self.include_attachments is True
        if self.include_attachments:
            attachments = self._get_page_attachments(page.id)
            for attachment in attachments:
                logger.info(
                    f"Processing attachment: {attachment.get('name', 'Unknown')} (ID: {attachment['id']})"
                )
                # Use downloadUrl from API; fallback to page URL
                raw_download = attachment.get("downloadUrl")
                if raw_download:
                    # Relative download paths are resolved against the base URL.
                    download_url = (
                        raw_download
                        if raw_download.startswith("http")
                        else f"{self.base_url.rstrip('/')}" + raw_download
                    )
                else:
                    download_url = page_url
                # Process the attachment
                attachment_sections, error = self._process_attachment(
                    attachment, page.id, download_url
                )
                if error:
                    # Attachment failures are logged and skipped — not fatal
                    # for the page itself.
                    logger.warning(
                        "Error processing attachment %s: %s",
                        attachment.get("name", "Unknown"),
                        error,
                    )
                    continue
                if attachment_sections:
                    sections.extend(attachment_sections)
                    logger.info(
                        "Added %d section(s) for attachment %s",
                        len(attachment_sections),
                        attachment.get("name", "Unknown"),
                    )
        # Create metadata
        metadata: dict[str, str | list[str]] = {
            "space_id": str(page.homeSpace),
            "page_id": str(page.id),
            "type": page.type,
        }
        # Create document
        return Document(
            id=page_url,
            sections=sections,
            source=DocumentSource.DRUPAL_WIKI,
            semantic_identifier=page.title,
            metadata=metadata,
            doc_updated_at=datetime_from_utc_timestamp(page.lastModified),
        )
    except Exception as e:
        logger.error(f"Error processing page {page.id}: {e}")
        return ConnectorFailure(
            failed_document=DocumentFailure(
                document_id=str(page.id),
                document_link=build_drupal_wiki_document_id(self.base_url, page.id),
            ),
            failure_message=f"Error processing page {page.id}: {e}",
            exception=e,
        )
@override
def load_from_checkpoint(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: DrupalWikiCheckpoint,
) -> CheckpointOutput[DrupalWikiCheckpoint]:
    """
    Load documents from a checkpoint.

    Processing order: explicit page ids first, then (if configured) every
    page of every selected space. The checkpoint indices are advanced as
    work completes so a re-invocation can resume.

    Args:
        start: Start time as seconds since Unix epoch.
        end: End time as seconds since Unix epoch.
        checkpoint: Checkpoint to resume from (mutated in place).

    Returns:
        Generator yielding documents/failures, then returning the updated checkpoint.
    """
    # Ensure page_ids is not None
    if checkpoint.page_ids is None:
        checkpoint.page_ids = []
    # Initialize page_ids from self.pages if not already set
    if not checkpoint.page_ids and self.pages:
        logger.info(f"Initializing page_ids from self.pages: {self.pages}")
        checkpoint.page_ids = [int(page_id.strip()) for page_id in self.pages]
    # Ensure spaces is not None
    if checkpoint.spaces is None:
        checkpoint.spaces = []
    # Phase 1: explicitly configured page ids.
    while checkpoint.current_page_id_index < len(checkpoint.page_ids):
        page_id = checkpoint.page_ids[checkpoint.current_page_id_index]
        logger.info(f"Processing page ID: {page_id}")
        try:
            # Get the page content directly
            page = self._get_page_content(page_id)
            # Skip pages outside the time range
            if not self._is_page_in_time_range(page.lastModified, start, end):
                logger.info(f"Skipping page {page_id} - outside time range")
                checkpoint.current_page_id_index += 1
                continue
            # Process the page
            doc_or_failure = self._process_page(page)
            yield doc_or_failure
        except Exception as e:
            logger.error(f"Error processing page ID {page_id}: {e}")
            yield ConnectorFailure(
                failed_document=DocumentFailure(
                    document_id=str(page_id),
                    document_link=build_drupal_wiki_document_id(
                        self.base_url, page_id
                    ),
                ),
                failure_message=f"Error processing page ID {page_id}: {e}",
                exception=e,
            )
        # Move to the next page ID
        checkpoint.current_page_id_index += 1
    # TODO: The main benefit of CheckpointedConnectors is that they can "save their work"
    # by storing a checkpoint so transient errors are easy to recover from: simply resume
    # from the last checkpoint. The way to get checkpoints saved is to return them somewhere
    # in the middle of this function. The guarantee our checkpointing system gives to you,
    # the connector implementer, is that when you return a checkpoint, this connector will
    # at a later time (generally within a few seconds) call the load_from_checkpoint function
    # again with the checkpoint you last returned as long as has_more=True.
    #
    # So, the current implementation doesn't take advantage of that system. This means that
    # all runs of the connector will start from scratch, i.e. what you've implemented here is
    # more akin to a LoadConnector.
    # Phase 2: process spaces if include_all_spaces is True or spaces are provided
    if self.include_all_spaces or self.spaces:
        logger.info("Processing spaces")
        # If include_all_spaces is True, always fetch all spaces
        # NOTE(review): this re-fetches the space list on every invocation,
        # overwriting checkpoint.spaces — the ids are sorted so ordering stays
        # deterministic across resumes.
        if self.include_all_spaces:
            logger.info("Fetching all spaces")
            # Fetch all spaces
            all_space_ids = self._get_space_ids()
            # checkpoint.spaces expects a list of ints; assign returned list
            checkpoint.spaces = all_space_ids
            logger.info(f"Found {len(checkpoint.spaces)} spaces to process")
        # Otherwise, use provided spaces if checkpoint is empty
        elif not checkpoint.spaces:
            logger.info(f"Using provided spaces: {self.spaces}")
            # Use provided spaces
            checkpoint.spaces = [int(space_id.strip()) for space_id in self.spaces]
        # Process spaces from the checkpoint
        while checkpoint.current_space_index < len(checkpoint.spaces):
            space_id = checkpoint.spaces[checkpoint.current_space_index]
            logger.info(f"Processing space ID: {space_id}")
            # Get pages for the current space, filtered by start time if provided
            pages = self._get_pages_for_space(space_id, modified_after=start)
            # Process pages from the checkpoint
            while checkpoint.current_page_index < len(pages):
                page = pages[checkpoint.current_page_index]
                logger.info(f"Processing page: {page.title} (ID: {page.id})")
                # For space-based pages, we already filtered by modifiedAfter in the API call
                # Only need to check the end time boundary
                if end and page.lastModified >= end:
                    logger.info(
                        f"Skipping page {page.id} - outside time range (after end)"
                    )
                    checkpoint.current_page_index += 1
                    continue
                # Process the page
                doc_or_failure = self._process_page(page)
                yield doc_or_failure
                # Move to the next page
                checkpoint.current_page_index += 1
            # Move to the next space
            checkpoint.current_space_index += 1
            checkpoint.current_page_index = 0
    # All spaces and pages processed
    logger.info("Finished processing all spaces and pages")
    checkpoint.has_more = False
    return checkpoint
@override
def build_dummy_checkpoint(self) -> DrupalWikiCheckpoint:
    """Return the initial checkpoint: no progress recorded, more work pending."""
    initial = DrupalWikiCheckpoint(
        has_more=True,
        current_space_index=0,
        current_page_index=0,
        current_page_id_index=0,
        spaces=[],
        page_ids=[],
        is_processing_specific_pages=False,
    )
    return initial
@override
def validate_checkpoint_json(self, checkpoint_json: str) -> DrupalWikiCheckpoint:
    """
    Parse a serialized checkpoint and return the validated model.

    Args:
        checkpoint_json: JSON string representing a checkpoint.

    Returns:
        Validated DrupalWikiCheckpoint.
    """
    return DrupalWikiCheckpoint.model_validate_json(checkpoint_json)
# TODO: unify approach with load_from_checkpoint.
# Ideally slim retrieval shares a lot of the same code with non-slim
# and we pass in a param is_slim to the main helper function
# that does the retrieval.
@override
def retrieve_all_slim_docs(
    self,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
    """
    Retrieve all slim documents (ids only — no content is downloaded beyond
    page/attachment metadata).

    Args:
        start: Start time as seconds since Unix epoch.
        end: End time as seconds since Unix epoch.
        callback: Callback for indexing heartbeat; should_stop() aborts early.

    Returns:
        Generator yielding batches (lists) of SlimDocument objects.
    """
    slim_docs: list[SlimDocument] = []
    logger.info(
        f"Starting retrieve_all_slim_docs with include_all_spaces={self.include_all_spaces}, spaces={self.spaces}"
    )
    # Process specific page IDs if provided
    if self.pages:
        logger.info(f"Processing specific pages: {self.pages}")
        for page_id in self.pages:
            try:
                # Get the page content directly
                page_content = self._get_page_content(int(page_id.strip()))
                # Skip pages outside the time range
                if not self._is_page_in_time_range(
                    page_content.lastModified, start, end
                ):
                    logger.info(f"Skipping page {page_id} - outside time range")
                    continue
                # Create slim document for the page
                page_url = build_drupal_wiki_document_id(
                    self.base_url, page_content.id
                )
                slim_docs.append(
                    SlimDocument(
                        id=page_url,
                    )
                )
                logger.info(f"Added slim document for page {page_content.id}")
                # Process attachments for this page; attachment ids are URL
                # fragments on the page id so they stay associated with it.
                attachments = self._get_page_attachments(page_content.id)
                for attachment in attachments:
                    if self._validate_attachment_filetype(attachment):
                        attachment_url = f"{page_url}#attachment-{attachment['id']}"
                        slim_docs.append(
                            SlimDocument(
                                id=attachment_url,
                            )
                        )
                        logger.info(
                            f"Added slim document for attachment {attachment['id']}"
                        )
                # Yield batch if it reaches the batch size
                if len(slim_docs) >= self.batch_size:
                    logger.info(
                        f"Yielding batch of {len(slim_docs)} slim documents"
                    )
                    yield slim_docs
                    slim_docs = []
                # Early abort: any accumulated (sub-batch) docs are dropped.
                if callback and callback.should_stop():
                    return
                if callback:
                    callback.progress("retrieve_all_slim_docs", 1)
            except Exception as e:
                # Best-effort: a failing page id is logged and skipped.
                logger.error(
                    f"Error processing page ID {page_id} for slim documents: {e}"
                )
    # Process spaces if include_all_spaces is True or spaces are provided
    if self.include_all_spaces or self.spaces:
        logger.info("Processing spaces for slim documents")
        # Get spaces to process
        spaces_to_process = []
        if self.include_all_spaces:
            logger.info("Fetching all spaces for slim documents")
            # Fetch all spaces
            all_space_ids = self._get_space_ids()
            spaces_to_process = all_space_ids
            logger.info(f"Found {len(spaces_to_process)} spaces to process")
        else:
            logger.info(f"Using provided spaces: {self.spaces}")
            # Use provided spaces
            spaces_to_process = [int(space_id.strip()) for space_id in self.spaces]
        # Process each space
        for space_id in spaces_to_process:
            logger.info(f"Processing space ID: {space_id}")
            # Get pages for the current space, filtered by start time if provided
            pages = self._get_pages_for_space(space_id, modified_after=start)
            # Process each page
            for page in pages:
                logger.info(f"Processing page: {page.title} (ID: {page.id})")
                # Skip pages outside the time range (start already applied via
                # the modifiedAfter API filter; only the end bound remains).
                if end and page.lastModified >= end:
                    logger.info(
                        f"Skipping page {page.id} - outside time range (after end)"
                    )
                    continue
                # Create slim document for the page
                page_url = build_drupal_wiki_document_id(self.base_url, page.id)
                slim_docs.append(
                    SlimDocument(
                        id=page_url,
                    )
                )
                logger.info(f"Added slim document for page {page.id}")
                # Process attachments for this page
                attachments = self._get_page_attachments(page.id)
                for attachment in attachments:
                    if self._validate_attachment_filetype(attachment):
                        attachment_url = f"{page_url}#attachment-{attachment['id']}"
                        slim_docs.append(
                            SlimDocument(
                                id=attachment_url,
                            )
                        )
                        logger.info(
                            f"Added slim document for attachment {attachment['id']}"
                        )
                # Yield batch if it reaches the batch size
                if len(slim_docs) >= self.batch_size:
                    logger.info(
                        f"Yielding batch of {len(slim_docs)} slim documents"
                    )
                    yield slim_docs
                    slim_docs = []
                if callback and callback.should_stop():
                    return
                if callback:
                    callback.progress("retrieve_all_slim_docs", 1)
    # Yield remaining documents
    if slim_docs:
        logger.info(f"Yielding final batch of {len(slim_docs)} slim documents")
        yield slim_docs
def validate_connector_settings(self) -> None:
    """
    Validate the connector settings by performing a lightweight API call.

    Raises:
        ConnectorMissingCredentialError: If credentials have not been loaded
            (no auth headers are present yet).
        ConnectorValidationError: If the Drupal Wiki API cannot be reached
            with the configured base URL / credentials.
    """
    # Headers are only populated once credentials are provided; without them
    # any API request would fail with an auth error anyway.
    if not self.headers:
        raise ConnectorMissingCredentialError("Drupal Wiki")
    try:
        # Fetching the space id list is a cheap request that exercises both
        # the base URL and the credentials.
        self._get_space_ids()
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the root cause (timeout, DNS,
        # HTTP status, ...) is preserved in the traceback.
        raise ConnectorValidationError(f"Failed to connect to Drupal Wiki: {e}") from e
def _is_page_in_time_range(
    self,
    last_modified: int,
    start: SecondsSinceUnixEpoch | None,
    end: SecondsSinceUnixEpoch | None,
) -> bool:
    """
    Decide whether a page's last-modified timestamp lies inside [start, end).

    Args:
        last_modified: The page's last modified timestamp (epoch seconds).
        start: Inclusive lower bound; a falsy value (None or 0) disables it.
        end: Exclusive upper bound; a falsy value (None or 0) disables it.

    Returns:
        True when the timestamp satisfies every enabled bound.
    """
    # Guard-clause form: reject on the first violated bound.
    if start and last_modified < start:
        return False
    if end and last_modified >= end:
        return False
    return True

View File

@@ -0,0 +1,75 @@
from enum import Enum
from typing import Generic
from typing import List
from typing import Optional
from typing import TypeVar
from pydantic import BaseModel
from onyx.connectors.interfaces import ConnectorCheckpoint
class SpaceAccessStatus(str, Enum):
    """Enum for Drupal Wiki space access status.

    Values mirror the ``accessStatus`` strings returned by the Drupal Wiki
    API (consumed via ``DrupalWikiSpace.accessStatus``).
    """
    PRIVATE = "PRIVATE"
    ANONYMOUS = "ANONYMOUS"
    AUTHENTICATED = "AUTHENTICATED"
class DrupalWikiSpace(BaseModel):
    """Model for a Drupal Wiki space.

    Field names are camelCase to match the API's JSON payload.
    """

    id: int
    name: str
    type: str
    # Optional metadata — not every space has these populated.
    description: str | None = None
    accessStatus: SpaceAccessStatus | None = None
    color: str | None = None
class DrupalWikiPage(BaseModel):
    """Model for a Drupal Wiki page.

    Field names are camelCase to match the API's JSON payload.
    """

    id: int
    title: str
    # Id of the space the page belongs to.
    homeSpace: int
    # Last modification time, compared against epoch-seconds bounds by the
    # connector, so presumably seconds since the Unix epoch.
    lastModified: int
    type: str
    # Page body; optional — NOTE(review): may only be present on detail
    # responses, confirm against the API.
    body: str | None = None
T = TypeVar("T")


class DrupalWikiBaseResponse(BaseModel, Generic[T]):
    """Base model for paginated Drupal Wiki API responses.

    ``T`` is the element type carried in ``content`` (spaces or pages).
    """

    totalPages: int
    totalElements: int
    size: int
    # The items on the current page.
    content: list[T]
    # Page index — presumably zero-based; verify against the API docs.
    number: int
    first: bool
    last: bool
    numberOfElements: int
    empty: bool
class DrupalWikiSpaceResponse(DrupalWikiBaseResponse[DrupalWikiSpace]):
    """Model for the response from the Drupal Wiki spaces API (one page of spaces)."""
class DrupalWikiPageResponse(DrupalWikiBaseResponse[DrupalWikiPage]):
    """Model for the response from the Drupal Wiki pages API (one page of pages)."""
class DrupalWikiCheckpoint(ConnectorCheckpoint):
    """Checkpoint for the Drupal Wiki connector.

    Records how far indexing has progressed so an interrupted run can resume.
    """

    # Index into ``spaces`` of the space currently being processed.
    current_space_index: int = 0
    # Index of the current page within the current space.
    current_page_index: int = 0
    # Index into ``page_ids`` when processing explicitly listed pages.
    current_page_id_index: int = 0
    # Space ids queued for indexing.
    spaces: list[int] = []
    # Explicitly requested page ids.
    page_ids: list[int] = []
    # True while working through ``page_ids`` instead of whole spaces.
    is_processing_specific_pages: bool = False

View File

@@ -0,0 +1,20 @@
from datetime import datetime
from datetime import timezone
from onyx.utils.logger import setup_logger
logger = setup_logger()
def build_drupal_wiki_document_id(base_url: str, page_id: int) -> str:
    """Build the canonical document id (node URL) for a Drupal Wiki page.

    Args:
        base_url: Base URL of the wiki instance; a trailing slash is optional.
        page_id: Numeric id of the page (Drupal node id).

    Returns:
        ``<base_url>/node/<page_id>``, inserting a slash only when
        ``base_url`` does not already end with one.
    """
    separator = "" if base_url.endswith("/") else "/"
    return f"{base_url}{separator}node/{page_id}"
def datetime_from_timestamp(timestamp: int) -> datetime:
    """Interpret ``timestamp`` as seconds since the Unix epoch and return an
    aware ``datetime`` in UTC."""
    return datetime.fromtimestamp(timestamp, timezone.utc)

View File

@@ -196,6 +196,10 @@ CONNECTOR_CLASS_MAP = {
module_path="onyx.connectors.highspot.connector",
class_name="HighspotConnector",
),
DocumentSource.DRUPAL_WIKI: ConnectorMapping(
module_path="onyx.connectors.drupal_wiki.connector",
class_name="DrupalWikiConnector",
),
DocumentSource.IMAP: ConnectorMapping(
module_path="onyx.connectors.imap.connector",
class_name="ImapConnector",

BIN
web/public/DrupalWiki.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 480 KiB

View File

@@ -17,6 +17,9 @@ import {
} from "@/components/ui/fully_wrapped_tabs";
import { useFormikContext } from "formik";
// Define a general type for form values
type FormValues = Record<string, any>;
interface TabsFieldProps {
tabField: TabOption;
values: any;
@@ -30,6 +33,22 @@ const TabsField: FC<TabsFieldProps> = ({
connector,
currentCredential,
}) => {
const { setFieldValue } = useFormikContext<FormValues>(); // Initialize the tab value if not set
useEffect(() => {
if (!values[tabField.name] && tabField.tabs.length > 0) {
setFieldValue(
tabField.name,
tabField.defaultTab || tabField.tabs[0]?.value
);
}
}, [
tabField.name,
tabField.defaultTab,
tabField.tabs,
setFieldValue,
values,
]);
return (
<div className="w-full">
{tabField.label && (
@@ -54,16 +73,19 @@ const TabsField: FC<TabsFieldProps> = ({
<div className="text-sm text-muted-foreground">No tabs to display.</div>
) : (
<Tabs
defaultValue={tabField.tabs[0]?.value} // Optional chaining for safety, though the length check above handles it
value={values[tabField.name] || tabField.tabs[0]?.value} // Use current form value or default
className="w-full"
onValueChange={(newTab) => {
// Update the tab field value
setFieldValue(tabField.name, newTab);
// Clear values from other tabs but preserve defaults
tabField.tabs.forEach((tab) => {
if (tab.value !== newTab) {
tab.fields.forEach((field) => {
// Only clear if not default value
if (values[field.name] !== field.default) {
values[field.name] = field.default;
setFieldValue(field.name, field.default);
}
});
}
@@ -128,7 +150,7 @@ export const RenderField: FC<RenderFieldProps> = ({
connector,
currentCredential,
}) => {
const { setFieldValue } = useFormikContext<any>(); // Get Formik's context functions
const { setFieldValue } = useFormikContext<FormValues>(); // Get Formik's context functions
const label =
typeof field.label === "function"

View File

@@ -37,6 +37,7 @@ import discordIcon from "@public/discord.png";
import discourseIcon from "@public/Discourse.png";
import document360Icon from "@public/Document360.png";
import dropboxIcon from "@public/Dropbox.png";
import drupalwikiIcon from "@public/DrupalWiki.png";
import egnyteIcon from "@public/Egnyte.png";
import firefliesIcon from "@public/Fireflies.png";
import freshdeskIcon from "@public/Freshdesk.png";
@@ -834,6 +835,7 @@ export const DeepseekIcon = createLogoIcon(deepseekSVG);
export const DiscourseIcon = createLogoIcon(discourseIcon);
export const Document360Icon = createLogoIcon(document360Icon);
export const DropboxIcon = createLogoIcon(dropboxIcon);
export const DrupalWikiIcon = createLogoIcon(drupalwikiIcon);
export const EgnyteIcon = createLogoIcon(egnyteIcon);
export const FirefliesIcon = createLogoIcon(firefliesIcon);
export const FreshdeskIcon = createLogoIcon(freshdeskIcon);

View File

@@ -875,6 +875,79 @@ export const connectorConfigs: Record<
],
advanced_values: [],
},
drupal_wiki: {
description: "Configure Drupal Wiki connector",
values: [
{
type: "text",
query: "Enter the base URL of the Drupal Wiki instance:",
label: "Base URL",
name: "base_url",
optional: false,
description:
"The base URL of your Drupal Wiki instance (e.g., https://help.drupal-wiki.com )",
},
{
type: "tab",
name: "drupal_wiki_scope",
label: "What should we index from Drupal Wiki?",
optional: true,
tabs: [
{
value: "all_spaces",
label: "All Spaces",
fields: [
{
type: "string_tab",
label: "All Spaces",
name: "all_spaces_description",
description:
"This connector will index all spaces the provided credentials have access to!",
},
],
},
{
value: "specific_spaces",
label: "Specific Spaces and/or Pages",
fields: [
{
type: "list",
query: "Enter space IDs to include:",
label: "Space IDs",
name: "spaces",
description:
"Specify one or more space IDs to index. You can include spaces even if you also specify page IDs below. Only numeric values are allowed.",
optional: true,
transform: (values) =>
values.filter((value) => /^\d+$/.test(value.trim())),
},
{
type: "list",
query: "Enter page IDs to include:",
label: "Page IDs",
name: "pages",
description:
"Specify one or more page IDs to index. You can include specific pages even if you also specify spaces above. Only numeric values are allowed.",
optional: true,
transform: (values) =>
values.filter((value) => /^\d+$/.test(value.trim())),
},
],
},
],
},
{
type: "checkbox",
query: "Include attachments?",
label: "Include Attachments",
name: "include_attachments",
description:
"Enable processing of page attachments including images and documents",
default: false,
},
],
advanced_values: [],
},
axero: {
description: "Configure Axero connector",
values: [
@@ -1802,6 +1875,15 @@ export interface AxeroConfig {
spaces?: string[];
}
/** Connector configuration for the Drupal Wiki source (snake_case keys match the backend connector). */
export interface DrupalWikiConfig {
  /** Base URL of the Drupal Wiki instance. */
  base_url: string;
  /** Space ids (numeric strings) to index; used with the "specific_spaces" scope. */
  spaces?: string[];
  /** Page ids (numeric strings) to index; used with the "specific_spaces" scope. */
  pages?: string[];
  /** Index every accessible space — NOTE(review): presumably derived from the "all_spaces" scope tab; confirm. */
  include_all_spaces?: boolean;
  /** Selected scope tab value, e.g. "all_spaces" or "specific_spaces". */
  drupal_wiki_scope?: string;
  /** Whether page attachments (images and documents) are processed. */
  include_attachments?: boolean;
}
export interface TeamsConfig {
teams?: string[];
}

View File

@@ -266,6 +266,10 @@ export interface HighspotCredentialJson {
highspot_secret: string;
}
/** Credential payload for the Drupal Wiki connector. */
export interface DrupalWikiCredentialJson {
  /** Drupal Wiki personal access token. */
  drupal_wiki_api_token: string;
}
export interface ImapCredentialJson {
imap_username: string;
imap_password: string;
@@ -448,6 +452,9 @@ export const credentialTemplates: Record<ValidSources, any> = {
airtable: {
airtable_access_token: "",
} as AirtableCredentialJson,
drupal_wiki: {
drupal_wiki_api_token: "",
} as DrupalWikiCredentialJson,
xenforo: null,
google_sites: null,
file: null,
@@ -641,6 +648,9 @@ export const credentialDisplayNames: Record<string, string> = {
highspot_key: "Highspot Key",
highspot_secret: "Highspot Secret",
// Drupal Wiki
drupal_wiki_api_token: "Drupal Wiki Personal Access Token",
// Bitbucket
bitbucket_email: "Bitbucket Account Email",
bitbucket_api_token: "Bitbucket API Token",

View File

@@ -1,2 +1,2 @@
- Generated Files
* Generated files live here. This directory should be git ignored.
* Generated files live here. This directory should be git ignored.

View File

@@ -44,6 +44,7 @@ import {
AirtableIcon,
GitbookIcon,
HighspotIcon,
DrupalWikiIcon,
EmailIcon,
TestRailIcon,
} from "@/components/icons/icons";
@@ -300,6 +301,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Messaging,
docs: `${DOCS_ADMINS_PATH}/connectors/official/gmail/overview`,
},
drupal_wiki: {
icon: DrupalWikiIcon,
displayName: "Drupal Wiki",
category: SourceCategory.Wiki,
docs: `${DOCS_ADMINS_PATH}/connectors/official/drupal_wiki`,
},
imap: {
icon: EmailIcon,
displayName: "Email",

View File

@@ -501,6 +501,7 @@ export enum ValidSources {
Airtable = "airtable",
Gitbook = "gitbook",
Highspot = "highspot",
DrupalWiki = "drupal_wiki",
Imap = "imap",
Bitbucket = "bitbucket",
TestRail = "testrail",