Compare commits

...

49 Commits

Author SHA1 Message Date
rexjohannes
25e3483a1e Fix formatting in README.md 2025-12-19 23:34:14 +01:00
rexjohannes
948c567a84 Update docs URL for drupal_wiki source 2025-12-19 23:31:27 +01:00
rexjohannes
436bbe4870 Merge remote-tracking branch 'origin/feat/drupal-wiki-connector' into feat/drupal-wiki-connector 2025-12-19 21:50:34 +01:00
rexjohannes
8c9e5641a7 feat(connector): update type hints to use built-in list and dict syntax 2025-12-19 21:50:26 +01:00
rexjohannes
70b29e2334 Merge branch 'main' into feat/drupal-wiki-connector 2025-12-19 21:43:40 +01:00
rexjohannes
36f67c0bab remove: delete drupal-wiki connector tests and fixture 2025-12-19 21:43:15 +01:00
rexjohannes
acbbc4f62c Merge remote-tracking branch 'origin/feat/drupal-wiki-connector' into feat/drupal-wiki-connector 2025-12-19 21:28:45 +01:00
rexjohannes
8f199db845 feat(connector): initialize None values for headers, api_token, and checkpoint attributes 2025-12-19 21:28:28 +01:00
rexjohannes
228543fb52 Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-12-19 20:57:08 +01:00
rexjohannes
17b7b50301 feat(connector): enhance Drupal Wiki connector with improved space handling and updated models 2025-12-19 20:53:31 +01:00
rexjohannes
7755daa06d Merge branch 'main' into feat/drupal-wiki-connector 2025-12-19 19:22:16 +01:00
rexjohannes
1a4cffd029 Merge branch 'main' into feat/drupal-wiki-connector 2025-10-12 19:40:42 +02:00
rexjohannes
239e127bf6 Forcing an empty commit. 2025-10-09 14:49:33 +00:00
rexjohannes
febaa55dd3 refactor(tests): format patching method for rate_limited_get in test cases 2025-10-09 14:38:48 +00:00
rexjohannes
5dbf60342d refactor(tests): update patching method for rate_limited_get in test cases 2025-10-09 14:32:25 +00:00
rexjohannes
13c658e158 refactor(connector.py): move import of override from typing_extensions 2025-10-09 14:12:39 +00:00
rexjohannes
200bcf8ba6 refactor(Drupal Wiki Connector): Clean up import statements and improve logging format 2025-10-09 14:04:49 +00:00
rexjohannes
a589c5b78a Merge branch 'main' into feat/drupal-wiki-connector 2025-10-09 15:47:54 +02:00
rexjohannes
0e73642b52 refactor(Drupal Wiki): Remove unused HTML text extraction function and optimize page fetching logic 2025-10-09 13:32:36 +00:00
rexjohannes
0307eec14c feat(Drupal Wiki): Add attachment size threshold and improve file type validation 2025-10-08 16:13:56 +00:00
rexjohannes
b6a1e22adb Merge branch 'main' into feat/drupal-wiki-connector 2025-10-08 16:53:50 +02:00
rexjohannes
e0ca0b32b1 feat: add Drupal Wiki as a document source and update test to use new method name 2025-10-06 22:58:11 +00:00
rexjohannes
4d51a3cd32 fix: update Drupal Wiki base URL description and add connector mapping 2025-10-06 22:29:41 +00:00
rexjohannes
d931ca3e99 refactor: rename retrieve_all_slim_documents to retrieve_all_slim_docs and add backward compatibility wrapper 2025-10-06 22:09:09 +00:00
rexjohannes
0d61b02736 fix: remove unused connector imports from factory.py 2025-10-06 21:55:00 +00:00
rexjohannes
b215e69200 Merge branch 'main' into feat/drupal-wiki-connector 2025-10-06 21:47:46 +00:00
rexjohannes
5cf3d273ff feat: streamline image storage process by removing unnecessary session context 2025-07-30 16:17:23 +02:00
rexjohannes
00717ba1ad Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-07-30 14:55:11 +02:00
rexjohannes
b5042c3856 Merge branch 'main' into feat/drupal-wiki-connector 2025-07-24 23:32:09 +02:00
rexjohannes
3a89549b6a feat: update attachment processing to use direct download URLs 2025-07-12 01:56:20 +02:00
rexjohannes
5ee3b2538e Merge branch 'main' into feat/drupal-wiki-connector 2025-07-12 01:00:15 +02:00
rexjohannes
0975b3878b refactor: update import path and correct parameter names in image processing 2025-07-10 17:19:53 +02:00
rexjohannes
fef2acb45c Merge branch 'main' into feat/drupal-wiki-connector 2025-07-10 13:33:41 +02:00
rexjohannes
b9bc5356f9 Merge branch 'main' into feat/drupal-wiki-connector 2025-07-07 23:59:12 +02:00
rexjohannes
e872956b8f Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-06-20 22:46:05 +02:00
rexjohannes
6afba2787f remove perm_sync_data from SlimDocument in DrupalWikiConnector 2025-06-10 09:58:57 +00:00
rexjohannes
c52fac3683 Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-06-10 11:30:59 +02:00
rexjohannes
dcd0526832 Fix formatting in Drupal wiki test and apply prettier formatting to web components
- Fixed indentation and formatting in drupal wiki test
- Applied prettier formatting across web components
- Ensured pre-commit hooks pass correctly
2025-06-07 21:17:54 +00:00
rexjohannes
6a16586a5a Merge branch 'onyx-dot-app:main' into feat/drupal-wiki-connector 2025-06-07 22:40:54 +02:00
rexjohannes
313af6474b fix: improve formatting of mock API response in test for Drupal Wiki connector 2025-06-06 21:58:36 +00:00
rexjohannes
c82335907a fix mypy-check
fix https://github.com/onyx-dot-app/onyx/actions/runs/15486641039/job/43602716159?pr=4773
2025-06-06 12:42:43 +02:00
rexjohannes
109cc2aa1b fix: merge conflicts 2025-06-06 08:49:27 +00:00
rexjohannes
de99704dfa Update backend/onyx/connectors/drupal_wiki/connector.py
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-05-26 21:43:56 +02:00
rexjohannes
c0305a9e47 fix: Refactor code for improved readability and consistency in various components 2025-05-26 20:33:47 +02:00
rexjohannes
95aed16535 fix: Update tab handling to use current form value and improve state management 2025-05-26 00:15:20 +02:00
rexjohannes
2745d28d8a fix: Ensure safe access to tab values and handle empty tab lists 2025-05-25 23:46:56 +02:00
rexjohannes
3dbdf33bbd Merge branch 'main' into feat/drupal-wiki-connector 2025-05-25 23:05:58 +02:00
rexjohannes
83c3163597 feat: Implement Drupal Wiki connector with configuration and document processing 2025-05-25 22:59:57 +02:00
rexjohannes
1c37dff80c feat: Add DrupalWiki connector 2025-05-25 22:55:15 +02:00
17 changed files with 1177 additions and 4 deletions

0
backend/__init__.py Normal file
View File

View File

@@ -541,6 +541,11 @@ GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD = int(
os.environ.get("GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD", 10 * 1024 * 1024)
)
# Maximum size (in bytes) for Drupal Wiki attachments; larger files are skipped
# by the connector. Overridable via env var; default is 10MB.
DRUPAL_WIKI_ATTACHMENT_SIZE_THRESHOLD = int(
    os.environ.get("DRUPAL_WIKI_ATTACHMENT_SIZE_THRESHOLD", 10 * 1024 * 1024)
)
# Default size threshold for SharePoint files (20MB)
SHAREPOINT_CONNECTOR_SIZE_THRESHOLD = int(
os.environ.get("SHAREPOINT_CONNECTOR_SIZE_THRESHOLD", 20 * 1024 * 1024)

View File

@@ -209,6 +209,7 @@ class DocumentSource(str, Enum):
EGNYTE = "egnyte"
AIRTABLE = "airtable"
HIGHSPOT = "highspot"
DRUPAL_WIKI = "drupal_wiki"
IMAP = "imap"
BITBUCKET = "bitbucket"
@@ -628,6 +629,7 @@ project management, and collaboration tools into a single, customizable platform
DocumentSource.EGNYTE: "egnyte - files",
DocumentSource.AIRTABLE: "airtable - database",
DocumentSource.HIGHSPOT: "highspot - CRM data",
DocumentSource.DRUPAL_WIKI: "drupal wiki - knowledge base content (pages, spaces, attachments)",
DocumentSource.IMAP: "imap - email data",
DocumentSource.TESTRAIL: "testrail - test case management tool for QA processes",
}

View File

@@ -71,6 +71,13 @@ def time_str_to_utc(datetime_str: str) -> datetime:
raise ValueError(f"Unable to parse datetime string: {datetime_str}")
# TODO: use this function in other connectors
def datetime_from_utc_timestamp(timestamp: int) -> datetime:
    """Interpret ``timestamp`` as seconds since the Unix epoch and return a timezone-aware UTC datetime."""
    return datetime.fromtimestamp(timestamp, timezone.utc)
def basic_expert_info_representation(info: BasicExpertInfo) -> str | None:
if info.first_name and info.last_name:
return f"{info.first_name} {info.middle_initial} {info.last_name}"

View File

@@ -0,0 +1,936 @@
import mimetypes
from io import BytesIO
from pathlib import Path
from typing import Any
import requests
from typing_extensions import override
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.app_configs import DRUPAL_WIKI_ATTACHMENT_SIZE_THRESHOLD
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
datetime_from_utc_timestamp,
)
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import rate_limit_builder
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import rl_requests
from onyx.connectors.drupal_wiki.models import DrupalWikiCheckpoint
from onyx.connectors.drupal_wiki.models import DrupalWikiPage
from onyx.connectors.drupal_wiki.models import DrupalWikiPageResponse
from onyx.connectors.drupal_wiki.models import DrupalWikiSpaceResponse
from onyx.connectors.drupal_wiki.utils import build_drupal_wiki_document_id
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.interfaces import CheckpointedConnector
from onyx.connectors.interfaces import CheckpointOutput
from onyx.connectors.interfaces import ConnectorFailure
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_text_and_images
from onyx.file_processing.file_types import OnyxFileExtensions
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.b64 import get_image_type_from_bytes
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
logger = setup_logger()
# Largest page size the Drupal Wiki REST API accepts per request.
MAX_API_PAGE_SIZE = 2000  # max allowed by API
# Query-parameter name used to filter the page listing by space id.
DRUPAL_WIKI_SPACE_KEY = "space"
# File extensions the connector will attempt to index as attachments.
SUPPORTED_ATTACHMENT_EXTENSIONS = OnyxFileExtensions.ALL_ALLOWED_EXTENSIONS
# HTTP GET wrapped with retries and rate limiting (at most 10 calls per 1s period).
rate_limited_get = retry_builder()(
    rate_limit_builder(max_calls=10, period=1)(rl_requests.get)
)
class DrupalWikiConnector(
    CheckpointedConnector[DrupalWikiCheckpoint],
    SlimConnector,
):
    """Connector that indexes pages and attachments from a Drupal Wiki instance.

    Implements both checkpointed (resumable) indexing and slim-document retrieval.
    """

    def __init__(
        self,
        base_url: str,
        spaces: list[str] | None = None,
        pages: list[str] | None = None,
        include_all_spaces: bool = False,
        batch_size: int = INDEX_BATCH_SIZE,
        continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
        drupal_wiki_scope: str | None = None,
        include_attachments: bool = False,
        allow_images: bool = False,
    ) -> None:
        """
        Initialize the Drupal Wiki connector.

        Args:
            base_url: The base URL of the Drupal Wiki instance (e.g., https://help.drupal-wiki.com)
            spaces: List of space IDs to index. If None and include_all_spaces is False, no spaces will be indexed.
            pages: List of page IDs to index. If provided, only these specific pages will be indexed.
            include_all_spaces: If True, all spaces will be indexed regardless of the spaces parameter.
            batch_size: Number of documents to process in a batch.
            continue_on_failure: If True, continue indexing even if some documents fail.
            drupal_wiki_scope: The selected tab value from the frontend. If "all_spaces", all spaces will be indexed.
            include_attachments: If True, enable processing of page attachments including images and documents.
            allow_images: If True, enable processing of image attachments.
        """
        self.base_url = base_url.rstrip("/")  # normalize away any trailing slash
        self.spaces = spaces or []
        self.pages = pages or []
        # Determine whether to include all spaces based on the selected tab
        # If drupal_wiki_scope is "all_spaces", we should index all spaces
        # If it's "specific_spaces", we should only index the specified spaces
        # If it's None, we use the include_all_spaces parameter
        if drupal_wiki_scope == "all_spaces":
            logger.info("drupal_wiki_scope is 'all_spaces', will index all spaces")
            self.include_all_spaces = True
        elif drupal_wiki_scope == "specific_spaces":
            logger.info(
                "drupal_wiki_scope is 'specific_spaces', will only index specified spaces"
            )
            self.include_all_spaces = False
        else:
            logger.info(
                f"drupal_wiki_scope not set, using include_all_spaces={include_all_spaces}"
            )
            self.include_all_spaces = include_all_spaces
        self.batch_size = batch_size
        self.continue_on_failure = continue_on_failure
        # Attachment processing configuration
        self.include_attachments = include_attachments
        self.allow_images = allow_images
        # Will be set by load_credentials
        self.headers: dict[str, str] | None = None
        self._api_token: str | None = None
def set_allow_images(self, value: bool) -> None:
    """Toggle processing of image attachments after construction."""
    logger.info(f"Setting allow_images to {value}.")
    self.allow_images = value
def _get_page_attachments(self, page_id: int) -> list[dict[str, Any]]:
    """
    Fetch the attachment listing for one page.

    Args:
        page_id: ID of the page.

    Returns:
        A list of attachment dicts as returned by the API. Any failure is
        logged and results in an empty list — never an exception.
    """
    endpoint = f"{self.base_url}/api/rest/scope/api/attachment"
    logger.info(f"Fetching attachments for page {page_id} from {endpoint}")
    try:
        resp = rate_limited_get(
            endpoint, headers=self.headers, params={"pageId": str(page_id)}
        )
        resp.raise_for_status()
        result = resp.json()
        logger.info(f"Found {len(result)} attachments for page {page_id}")
        return result
    except Exception as e:
        logger.warning(f"Failed to fetch attachments for page {page_id}: {e}")
        return []
def _download_attachment(self, attachment_id: int) -> bytes:
    """
    Download the raw content of an attachment.

    Args:
        attachment_id: ID of the attachment to download.

    Returns:
        Raw bytes of the attachment.

    Raises:
        ValueError: if load_credentials has not populated self.headers yet.
        Exception: whatever raise_for_status raises on an HTTP error.
    """
    endpoint = f"{self.base_url}/api/rest/scope/api/attachment/{attachment_id}/download"
    logger.info(f"Downloading attachment {attachment_id} from {endpoint}")
    if self.headers is None:
        raise ValueError(
            "Headers not set in connector instance"
        )  # because mypy complains
    # Binary download: send only Authorization, without the JSON Accept header.
    auth_only_headers = {"Authorization": self.headers["Authorization"]}
    resp = rate_limited_get(endpoint, headers=auth_only_headers)
    resp.raise_for_status()
    return resp.content
def _validate_attachment_filetype(self, attachment: dict[str, Any]) -> bool:
    """
    Check whether an attachment's file extension is one we can index.

    Args:
        attachment: Attachment dictionary from Drupal Wiki API.

    Returns:
        True when the extension is supported; False otherwise, including
        attachments that carry no "fileName" at all.
    """
    file_name = attachment.get("fileName", "")
    if not file_name:
        return False
    extension = Path(file_name).suffix.lower()
    if extension not in SUPPORTED_ATTACHMENT_EXTENSIONS:
        logger.info(f"Unsupported file type: {extension} for {file_name}")
        return False
    return True
def _get_media_type_from_filename(self, filename: str) -> str:
    """
    Best-effort MIME type lookup using the stdlib mimetypes table.

    Args:
        filename: The filename.

    Returns:
        The guessed media type, or "application/octet-stream" when the
        extension is unknown.
    """
    guessed, _ = mimetypes.guess_type(filename)
    return guessed if guessed else "application/octet-stream"
def _process_attachment(
    self,
    attachment: dict[str, Any],
    page_id: int,
    download_url: str,
) -> tuple[list[TextSection | ImageSection], str | None]:
    """
    Process a single attachment and return generated sections.

    Flow: validate extension -> size check -> download -> either store the
    whole file as an image section (image media types) or extract text plus
    any embedded images (all other types).

    Args:
        attachment: Attachment dictionary from Drupal Wiki API.
        page_id: ID of the parent page (used only for logging here).
        download_url: Direct download URL for the attachment.

    Returns:
        Tuple of (sections, error_message). If error_message is not None, the
        sections list should be treated as invalid.
    """
    sections: list[TextSection | ImageSection] = []
    try:
        if not self._validate_attachment_filetype(attachment):
            return (
                [],
                f"Unsupported file type: {attachment.get('fileName', 'unknown')}",
            )
        attachment_id = attachment["id"]
        file_name = attachment.get("fileName", f"attachment_{attachment_id}")
        # NOTE(review): fileSize defaults to 0, so an attachment missing the
        # field always passes the size check below — confirm that is intended.
        file_size = attachment.get("fileSize", 0)
        media_type = self._get_media_type_from_filename(file_name)
        if file_size > DRUPAL_WIKI_ATTACHMENT_SIZE_THRESHOLD:
            return [], f"Attachment too large: {file_size} bytes"
        try:
            raw_bytes = self._download_attachment(attachment_id)
        except Exception as e:
            return [], f"Failed to download attachment: {e}"
        # Image attachments: store directly as a single ImageSection (no text
        # extraction), or skip entirely when images are disabled.
        if media_type.startswith("image/"):
            if not self.allow_images:
                logger.info(
                    "Skipping image attachment %s because allow_images is False",
                    file_name,
                )
                return [], None
            try:
                image_section, _ = store_image_and_create_section(
                    image_data=raw_bytes,
                    file_id=str(attachment_id),
                    display_name=attachment.get(
                        "name", attachment.get("fileName", "Unknown")
                    ),
                    link=download_url,
                    media_type=media_type,
                    file_origin=FileOrigin.CONNECTOR,
                )
                sections.append(image_section)
                logger.info("Stored image attachment with file name: %s", file_name)
            except Exception as e:
                return [], f"Image storage failed: {e}"
            return sections, None
        # Non-image attachments: extract text, storing any embedded images via
        # the closure below. image_counter numbers the embedded images.
        image_counter = 0

        def _store_embedded_image(image_data: bytes, image_name: str) -> None:
            # Callback invoked by extract_text_and_images for each embedded
            # image; appends an ImageSection to the enclosing `sections` list.
            nonlocal image_counter
            if not self.allow_images:
                return
            media_for_image = self._get_media_type_from_filename(image_name)
            if media_for_image == "application/octet-stream":
                # Filename gave no type — try sniffing the bytes instead.
                try:
                    media_for_image = get_image_type_from_bytes(image_data)
                except ValueError:
                    logger.debug(
                        "Unable to determine media type for embedded image %s on attachment %s",
                        image_name,
                        file_name,
                    )
            image_counter += 1
            display_name = (
                image_name
                or f"{attachment.get('name', file_name)} - embedded image {image_counter}"
            )
            try:
                image_section, _ = store_image_and_create_section(
                    image_data=image_data,
                    file_id=f"{attachment_id}_embedded_{image_counter}",
                    display_name=display_name,
                    link=download_url,
                    media_type=media_for_image,
                    file_origin=FileOrigin.CONNECTOR,
                )
                sections.append(image_section)
            except Exception as err:
                # Embedded-image failures are logged but do not fail the attachment.
                logger.warning(
                    "Failed to store embedded image %s for attachment %s: %s",
                    image_name or image_counter,
                    file_name,
                    err,
                )

        extraction_result = extract_text_and_images(
            file=BytesIO(raw_bytes),
            file_name=file_name,
            content_type=media_type,
            image_callback=_store_embedded_image if self.allow_images else None,
        )
        text_content = extraction_result.text_content.strip()
        if text_content:
            # Text goes first so the document's primary section is textual.
            sections.insert(0, TextSection(text=text_content, link=download_url))
            logger.info(
                "Extracted %d characters from %s", len(text_content), file_name
            )
        elif not sections:
            # No text and no embedded images — nothing indexable.
            return [], f"No text extracted for {file_name}"
        return sections, None
    except Exception as e:
        logger.error(
            "Failed to process attachment %s on page %s: %s",
            attachment.get("name", "unknown"),
            page_id,
            e,
        )
        return [], f"Failed to process attachment: {e}"
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
    """
    Store the API token and build the default request headers.

    Args:
        credentials: Dictionary that must contain "drupal_wiki_api_token".

    Returns:
        Always None — no derived credentials are produced.

    Raises:
        ConnectorValidationError: if the token key is absent.
    """
    if "drupal_wiki_api_token" not in credentials:
        raise ConnectorValidationError(
            "API token is required for Drupal Wiki connector"
        )
    token = credentials["drupal_wiki_api_token"]
    self._api_token = token
    self.headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {token}",
    }
    return None
def _get_space_ids(self) -> list[int]:
    """
    Get all space IDs from the Drupal Wiki instance, paging through the API.

    Returns:
        List of space IDs (deduplicated). The list is sorted to be deterministic.
    """
    url = f"{self.base_url}/api/rest/scope/api/space"
    size = MAX_API_PAGE_SIZE
    page = 0
    all_space_ids: set[int] = set()
    has_more = True
    # Progress guard: besides has_more, the loop stops if a fetch adds no new
    # ids — protection against an API that keeps returning the same full page.
    last_num_ids = -1
    while has_more and len(all_space_ids) > last_num_ids:
        last_num_ids = len(all_space_ids)
        params = {"size": size, "page": page}
        logger.info(f"Fetching spaces from {url} (page={page}, size={size})")
        response = rate_limited_get(url, headers=self.headers, params=params)
        response.raise_for_status()
        resp_json = response.json()
        space_response = DrupalWikiSpaceResponse.model_validate(resp_json)
        logger.info(f"Fetched {len(space_response.content)} spaces (page={page})")
        # Collect ids into the set to deduplicate
        for space in space_response.content:
            all_space_ids.add(space.id)
        # Continue if we got a full page, indicating there might be more
        has_more = len(space_response.content) >= size
        page += 1
    # Return a deterministic, sorted list of ids
    space_id_list = list(sorted(all_space_ids))
    logger.info(f"Total spaces fetched: {len(space_id_list)}")
    return space_id_list
def _get_pages_for_space(
    self, space_id: int, modified_after: SecondsSinceUnixEpoch | None = None
) -> list[DrupalWikiPage]:
    """
    Get all pages for a specific space, optionally filtered by modification time.

    Args:
        space_id: ID of the space.
        modified_after: Only return pages modified after this timestamp (seconds since Unix epoch).

    Returns:
        List of DrupalWikiPage objects.

    Raises:
        ConnectorValidationError: if a response fails pydantic validation.
    """
    url = f"{self.base_url}/api/rest/scope/api/page"
    size = MAX_API_PAGE_SIZE
    page = 0
    all_pages = []
    has_more = True
    max_pages = 100  # Safety limit to prevent infinite loops
    while has_more and page < max_pages:
        params: dict[str, str | int] = {
            DRUPAL_WIKI_SPACE_KEY: str(space_id),
            "size": size,
            "page": page,
        }
        # Add modifiedAfter parameter if provided
        if modified_after is not None:
            params["modifiedAfter"] = int(modified_after)
        logger.info(
            f"Fetching pages for space {space_id} from {url} ({page=}, {size=}, {modified_after=})"
        )
        response = rate_limited_get(url, headers=self.headers, params=params)
        response.raise_for_status()
        resp_json = response.json()
        try:
            page_response = DrupalWikiPageResponse.model_validate(resp_json)
        except Exception as e:
            logger.error(f"Failed to validate Drupal Wiki page response: {e}")
            logger.debug(f"Response data: {resp_json}")
            raise ConnectorValidationError(f"Invalid API response format: {e}")
        logger.info(
            f"Fetched {len(page_response.content)} pages in space {space_id} (page={page})"
        )
        # Pydantic should automatically parse content items as DrupalWikiPage objects
        # If validation fails, it will raise an exception which we should catch
        all_pages.extend(page_response.content)
        # Continue if we got a full page, indicating there might be more
        has_more = len(page_response.content) >= size
        page += 1
        # Warn once as soon as the safety limit is reached (loop exits next check).
        if page >= max_pages:
            logger.warning(
                f"Reached maximum page limit ({max_pages}) while fetching pages for space {space_id}"
            )
    logger.info(f"Total pages fetched for space {space_id}: {len(all_pages)}")
    return all_pages
def _get_page_content(self, page_id: int) -> DrupalWikiPage:
    """
    Fetch a single page (including its body) by id.

    Args:
        page_id: ID of the page.

    Returns:
        The validated DrupalWikiPage model.
    """
    resp = rate_limited_get(
        f"{self.base_url}/api/rest/scope/api/page/{page_id}",
        headers=self.headers,
    )
    resp.raise_for_status()
    return DrupalWikiPage.model_validate(resp.json())
def _process_page(self, page: DrupalWikiPage) -> Document | ConnectorFailure:
    """
    Process a page and convert it to a Document.

    Args:
        page: DrupalWikiPage object.

    Returns:
        Document object, or ConnectorFailure if anything raised while building it.
    """
    try:
        # Extract text from HTML, handle None body
        text_content = parse_html_page_basic(page.body or "")
        # Ensure text_content is a string, not None
        if text_content is None:
            text_content = ""
        # Create document URL
        page_url = build_drupal_wiki_document_id(self.base_url, page.id)
        # Create sections with just the page content
        sections: list[TextSection | ImageSection] = [
            TextSection(text=text_content, link=page_url)
        ]
        # Only process attachments if self.include_attachments is True
        if self.include_attachments:
            attachments = self._get_page_attachments(page.id)
            for attachment in attachments:
                logger.info(
                    f"Processing attachment: {attachment.get('name', 'Unknown')} (ID: {attachment['id']})"
                )
                # Use downloadUrl from API; fallback to page URL
                raw_download = attachment.get("downloadUrl")
                if raw_download:
                    # Relative download paths are resolved against the base URL.
                    download_url = (
                        raw_download
                        if raw_download.startswith("http")
                        else f"{self.base_url.rstrip('/')}" + raw_download
                    )
                else:
                    download_url = page_url
                # Process the attachment
                attachment_sections, error = self._process_attachment(
                    attachment, page.id, download_url
                )
                if error:
                    # Attachment failures are logged and skipped — not fatal
                    # for the page itself.
                    logger.warning(
                        "Error processing attachment %s: %s",
                        attachment.get("name", "Unknown"),
                        error,
                    )
                    continue
                if attachment_sections:
                    sections.extend(attachment_sections)
                    logger.info(
                        "Added %d section(s) for attachment %s",
                        len(attachment_sections),
                        attachment.get("name", "Unknown"),
                    )
        # Create metadata
        metadata: dict[str, str | list[str]] = {
            "space_id": str(page.homeSpace),
            "page_id": str(page.id),
            "type": page.type,
        }
        # Create document
        return Document(
            id=page_url,
            sections=sections,
            source=DocumentSource.DRUPAL_WIKI,
            semantic_identifier=page.title,
            metadata=metadata,
            doc_updated_at=datetime_from_utc_timestamp(page.lastModified),
        )
    except Exception as e:
        logger.error(f"Error processing page {page.id}: {e}")
        return ConnectorFailure(
            failed_document=DocumentFailure(
                document_id=str(page.id),
                document_link=build_drupal_wiki_document_id(self.base_url, page.id),
            ),
            failure_message=f"Error processing page {page.id}: {e}",
            exception=e,
        )
@override
def load_from_checkpoint(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: DrupalWikiCheckpoint,
) -> CheckpointOutput[DrupalWikiCheckpoint]:
    """
    Load documents from a checkpoint.

    Processing order: explicit page ids first, then (if configured) every
    page of every selected space. The checkpoint indices are advanced as
    work completes so a re-invocation can resume.

    Args:
        start: Start time as seconds since Unix epoch.
        end: End time as seconds since Unix epoch.
        checkpoint: Checkpoint to resume from (mutated in place).

    Returns:
        Generator yielding documents/failures, then returning the updated checkpoint.
    """
    # Ensure page_ids is not None
    if checkpoint.page_ids is None:
        checkpoint.page_ids = []
    # Initialize page_ids from self.pages if not already set
    if not checkpoint.page_ids and self.pages:
        logger.info(f"Initializing page_ids from self.pages: {self.pages}")
        checkpoint.page_ids = [int(page_id.strip()) for page_id in self.pages]
    # Ensure spaces is not None
    if checkpoint.spaces is None:
        checkpoint.spaces = []
    # Phase 1: explicitly configured page ids.
    while checkpoint.current_page_id_index < len(checkpoint.page_ids):
        page_id = checkpoint.page_ids[checkpoint.current_page_id_index]
        logger.info(f"Processing page ID: {page_id}")
        try:
            # Get the page content directly
            page = self._get_page_content(page_id)
            # Skip pages outside the time range
            if not self._is_page_in_time_range(page.lastModified, start, end):
                logger.info(f"Skipping page {page_id} - outside time range")
                checkpoint.current_page_id_index += 1
                continue
            # Process the page
            doc_or_failure = self._process_page(page)
            yield doc_or_failure
        except Exception as e:
            logger.error(f"Error processing page ID {page_id}: {e}")
            yield ConnectorFailure(
                failed_document=DocumentFailure(
                    document_id=str(page_id),
                    document_link=build_drupal_wiki_document_id(
                        self.base_url, page_id
                    ),
                ),
                failure_message=f"Error processing page ID {page_id}: {e}",
                exception=e,
            )
        # Move to the next page ID
        checkpoint.current_page_id_index += 1
    # TODO: The main benefit of CheckpointedConnectors is that they can "save their work"
    # by storing a checkpoint so transient errors are easy to recover from: simply resume
    # from the last checkpoint. The way to get checkpoints saved is to return them somewhere
    # in the middle of this function. The guarantee our checkpointing system gives to you,
    # the connector implementer, is that when you return a checkpoint, this connector will
    # at a later time (generally within a few seconds) call the load_from_checkpoint function
    # again with the checkpoint you last returned as long as has_more=True.
    #
    # So, the current implementation doesn't take advantage of that system. This means that
    # all runs of the connector will start from scratch, i.e. what you've implemented here is
    # more akin to a LoadConnector.
    # Phase 2: process spaces if include_all_spaces is True or spaces are provided
    if self.include_all_spaces or self.spaces:
        logger.info("Processing spaces")
        # If include_all_spaces is True, always fetch all spaces
        # NOTE(review): this re-fetches the space list on every invocation,
        # overwriting checkpoint.spaces — the ids are sorted so ordering stays
        # deterministic across resumes.
        if self.include_all_spaces:
            logger.info("Fetching all spaces")
            # Fetch all spaces
            all_space_ids = self._get_space_ids()
            # checkpoint.spaces expects a list of ints; assign returned list
            checkpoint.spaces = all_space_ids
            logger.info(f"Found {len(checkpoint.spaces)} spaces to process")
        # Otherwise, use provided spaces if checkpoint is empty
        elif not checkpoint.spaces:
            logger.info(f"Using provided spaces: {self.spaces}")
            # Use provided spaces
            checkpoint.spaces = [int(space_id.strip()) for space_id in self.spaces]
        # Process spaces from the checkpoint
        while checkpoint.current_space_index < len(checkpoint.spaces):
            space_id = checkpoint.spaces[checkpoint.current_space_index]
            logger.info(f"Processing space ID: {space_id}")
            # Get pages for the current space, filtered by start time if provided
            pages = self._get_pages_for_space(space_id, modified_after=start)
            # Process pages from the checkpoint
            while checkpoint.current_page_index < len(pages):
                page = pages[checkpoint.current_page_index]
                logger.info(f"Processing page: {page.title} (ID: {page.id})")
                # For space-based pages, we already filtered by modifiedAfter in the API call
                # Only need to check the end time boundary
                if end and page.lastModified >= end:
                    logger.info(
                        f"Skipping page {page.id} - outside time range (after end)"
                    )
                    checkpoint.current_page_index += 1
                    continue
                # Process the page
                doc_or_failure = self._process_page(page)
                yield doc_or_failure
                # Move to the next page
                checkpoint.current_page_index += 1
            # Move to the next space
            checkpoint.current_space_index += 1
            checkpoint.current_page_index = 0
    # All spaces and pages processed
    logger.info("Finished processing all spaces and pages")
    checkpoint.has_more = False
    return checkpoint
@override
def build_dummy_checkpoint(self) -> DrupalWikiCheckpoint:
    """Return the initial checkpoint: no progress recorded, more work pending."""
    initial = DrupalWikiCheckpoint(
        has_more=True,
        current_space_index=0,
        current_page_index=0,
        current_page_id_index=0,
        spaces=[],
        page_ids=[],
        is_processing_specific_pages=False,
    )
    return initial
@override
def validate_checkpoint_json(self, checkpoint_json: str) -> DrupalWikiCheckpoint:
    """
    Parse a serialized checkpoint and return the validated model.

    Args:
        checkpoint_json: JSON string representing a checkpoint.

    Returns:
        Validated DrupalWikiCheckpoint.
    """
    return DrupalWikiCheckpoint.model_validate_json(checkpoint_json)
# TODO: unify approach with load_from_checkpoint.
# Ideally slim retrieval shares a lot of the same code with non-slim
# and we pass in a param is_slim to the main helper function
# that does the retrieval.
@override
def retrieve_all_slim_docs(
    self,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
    """
    Retrieve all slim documents (ids only — no content is downloaded beyond
    page/attachment metadata).

    Args:
        start: Start time as seconds since Unix epoch.
        end: End time as seconds since Unix epoch.
        callback: Callback for indexing heartbeat; should_stop() aborts early.

    Returns:
        Generator yielding batches (lists) of SlimDocument objects.
    """
    slim_docs: list[SlimDocument] = []
    logger.info(
        f"Starting retrieve_all_slim_docs with include_all_spaces={self.include_all_spaces}, spaces={self.spaces}"
    )
    # Process specific page IDs if provided
    if self.pages:
        logger.info(f"Processing specific pages: {self.pages}")
        for page_id in self.pages:
            try:
                # Get the page content directly
                page_content = self._get_page_content(int(page_id.strip()))
                # Skip pages outside the time range
                if not self._is_page_in_time_range(
                    page_content.lastModified, start, end
                ):
                    logger.info(f"Skipping page {page_id} - outside time range")
                    continue
                # Create slim document for the page
                page_url = build_drupal_wiki_document_id(
                    self.base_url, page_content.id
                )
                slim_docs.append(
                    SlimDocument(
                        id=page_url,
                    )
                )
                logger.info(f"Added slim document for page {page_content.id}")
                # Process attachments for this page; attachment ids are URL
                # fragments on the page id so they stay associated with it.
                attachments = self._get_page_attachments(page_content.id)
                for attachment in attachments:
                    if self._validate_attachment_filetype(attachment):
                        attachment_url = f"{page_url}#attachment-{attachment['id']}"
                        slim_docs.append(
                            SlimDocument(
                                id=attachment_url,
                            )
                        )
                        logger.info(
                            f"Added slim document for attachment {attachment['id']}"
                        )
                # Yield batch if it reaches the batch size
                if len(slim_docs) >= self.batch_size:
                    logger.info(
                        f"Yielding batch of {len(slim_docs)} slim documents"
                    )
                    yield slim_docs
                    slim_docs = []
                # Early abort: any accumulated (sub-batch) docs are dropped.
                if callback and callback.should_stop():
                    return
                if callback:
                    callback.progress("retrieve_all_slim_docs", 1)
            except Exception as e:
                # Best-effort: a failing page id is logged and skipped.
                logger.error(
                    f"Error processing page ID {page_id} for slim documents: {e}"
                )
    # Process spaces if include_all_spaces is True or spaces are provided
    if self.include_all_spaces or self.spaces:
        logger.info("Processing spaces for slim documents")
        # Get spaces to process
        spaces_to_process = []
        if self.include_all_spaces:
            logger.info("Fetching all spaces for slim documents")
            # Fetch all spaces
            all_space_ids = self._get_space_ids()
            spaces_to_process = all_space_ids
            logger.info(f"Found {len(spaces_to_process)} spaces to process")
        else:
            logger.info(f"Using provided spaces: {self.spaces}")
            # Use provided spaces
            spaces_to_process = [int(space_id.strip()) for space_id in self.spaces]
        # Process each space
        for space_id in spaces_to_process:
            logger.info(f"Processing space ID: {space_id}")
            # Get pages for the current space, filtered by start time if provided
            pages = self._get_pages_for_space(space_id, modified_after=start)
            # Process each page
            for page in pages:
                logger.info(f"Processing page: {page.title} (ID: {page.id})")
                # Skip pages outside the time range (start already applied via
                # the modifiedAfter API filter; only the end bound remains).
                if end and page.lastModified >= end:
                    logger.info(
                        f"Skipping page {page.id} - outside time range (after end)"
                    )
                    continue
                # Create slim document for the page
                page_url = build_drupal_wiki_document_id(self.base_url, page.id)
                slim_docs.append(
                    SlimDocument(
                        id=page_url,
                    )
                )
                logger.info(f"Added slim document for page {page.id}")
                # Process attachments for this page
                attachments = self._get_page_attachments(page.id)
                for attachment in attachments:
                    if self._validate_attachment_filetype(attachment):
                        attachment_url = f"{page_url}#attachment-{attachment['id']}"
                        slim_docs.append(
                            SlimDocument(
                                id=attachment_url,
                            )
                        )
                        logger.info(
                            f"Added slim document for attachment {attachment['id']}"
                        )
                # Yield batch if it reaches the batch size
                if len(slim_docs) >= self.batch_size:
                    logger.info(
                        f"Yielding batch of {len(slim_docs)} slim documents"
                    )
                    yield slim_docs
                    slim_docs = []
                if callback and callback.should_stop():
                    return
                if callback:
                    callback.progress("retrieve_all_slim_docs", 1)
    # Yield remaining documents
    if slim_docs:
        logger.info(f"Yielding final batch of {len(slim_docs)} slim documents")
        yield slim_docs
def validate_connector_settings(self) -> None:
    """
    Validate the connector settings by performing a lightweight API call.

    Raises:
        ConnectorMissingCredentialError: If credentials have not been loaded
            (no auth headers are present yet).
        ConnectorValidationError: If the Drupal Wiki API cannot be reached
            with the configured base URL / credentials.
    """
    # Headers are only populated once credentials are provided; without them
    # any API request would fail with an auth error anyway.
    if not self.headers:
        raise ConnectorMissingCredentialError("Drupal Wiki")
    try:
        # Fetching the space id list is a cheap request that exercises both
        # the base URL and the credentials.
        self._get_space_ids()
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the root cause (timeout, DNS,
        # HTTP status, ...) is preserved in the traceback.
        raise ConnectorValidationError(f"Failed to connect to Drupal Wiki: {e}") from e
def _is_page_in_time_range(
    self,
    last_modified: int,
    start: SecondsSinceUnixEpoch | None,
    end: SecondsSinceUnixEpoch | None,
) -> bool:
    """
    Decide whether a page's last-modified timestamp lies inside [start, end).

    Args:
        last_modified: The page's last modified timestamp (epoch seconds).
        start: Inclusive lower bound; a falsy value (None or 0) disables it.
        end: Exclusive upper bound; a falsy value (None or 0) disables it.

    Returns:
        True when the timestamp satisfies every enabled bound.
    """
    # Guard-clause form: reject on the first violated bound.
    if start and last_modified < start:
        return False
    if end and last_modified >= end:
        return False
    return True

View File

@@ -0,0 +1,75 @@
from enum import Enum
from typing import Generic
from typing import List
from typing import Optional
from typing import TypeVar
from pydantic import BaseModel
from onyx.connectors.interfaces import ConnectorCheckpoint
class SpaceAccessStatus(str, Enum):
    """Enum for Drupal Wiki space access status.

    Values mirror the ``accessStatus`` strings returned by the Drupal Wiki
    API (consumed via ``DrupalWikiSpace.accessStatus``).
    """
    PRIVATE = "PRIVATE"
    ANONYMOUS = "ANONYMOUS"
    AUTHENTICATED = "AUTHENTICATED"
class DrupalWikiSpace(BaseModel):
    """Model for a Drupal Wiki space.

    Field names are camelCase to match the API's JSON payload.
    """

    id: int
    name: str
    type: str
    # Optional metadata — not every space has these populated.
    description: str | None = None
    accessStatus: SpaceAccessStatus | None = None
    color: str | None = None
class DrupalWikiPage(BaseModel):
    """Model for a Drupal Wiki page.

    Field names are camelCase to match the API's JSON payload.
    """

    id: int
    title: str
    # Id of the space the page belongs to.
    homeSpace: int
    # Last modification time, compared against epoch-seconds bounds by the
    # connector, so presumably seconds since the Unix epoch.
    lastModified: int
    type: str
    # Page body; optional — NOTE(review): may only be present on detail
    # responses, confirm against the API.
    body: str | None = None
T = TypeVar("T")


class DrupalWikiBaseResponse(BaseModel, Generic[T]):
    """Base model for paginated Drupal Wiki API responses.

    ``T`` is the element type carried in ``content`` (spaces or pages).
    """

    totalPages: int
    totalElements: int
    size: int
    # The items on the current page.
    content: list[T]
    # Page index — presumably zero-based; verify against the API docs.
    number: int
    first: bool
    last: bool
    numberOfElements: int
    empty: bool
class DrupalWikiSpaceResponse(DrupalWikiBaseResponse[DrupalWikiSpace]):
    """Model for the response from the Drupal Wiki spaces API (one page of spaces)."""
class DrupalWikiPageResponse(DrupalWikiBaseResponse[DrupalWikiPage]):
    """Model for the response from the Drupal Wiki pages API (one page of pages)."""
class DrupalWikiCheckpoint(ConnectorCheckpoint):
    """Checkpoint for the Drupal Wiki connector.

    Records how far indexing has progressed so an interrupted run can resume.
    """

    # Index into ``spaces`` of the space currently being processed.
    current_space_index: int = 0
    # Index of the current page within the current space.
    current_page_index: int = 0
    # Index into ``page_ids`` when processing explicitly listed pages.
    current_page_id_index: int = 0
    # Space ids queued for indexing.
    spaces: list[int] = []
    # Explicitly requested page ids.
    page_ids: list[int] = []
    # True while working through ``page_ids`` instead of whole spaces.
    is_processing_specific_pages: bool = False

View File

@@ -0,0 +1,20 @@
from datetime import datetime
from datetime import timezone
from onyx.utils.logger import setup_logger
logger = setup_logger()
def build_drupal_wiki_document_id(base_url: str, page_id: int) -> str:
    """Build the canonical document id (node URL) for a Drupal Wiki page.

    Args:
        base_url: Base URL of the wiki instance; a trailing slash is optional.
        page_id: Numeric id of the page (Drupal node id).

    Returns:
        ``<base_url>/node/<page_id>``, inserting a slash only when
        ``base_url`` does not already end with one.
    """
    separator = "" if base_url.endswith("/") else "/"
    return f"{base_url}{separator}node/{page_id}"
def datetime_from_timestamp(timestamp: int) -> datetime:
    """Interpret ``timestamp`` as seconds since the Unix epoch and return an
    aware ``datetime`` in UTC."""
    return datetime.fromtimestamp(timestamp, timezone.utc)

View File

@@ -196,6 +196,10 @@ CONNECTOR_CLASS_MAP = {
module_path="onyx.connectors.highspot.connector",
class_name="HighspotConnector",
),
DocumentSource.DRUPAL_WIKI: ConnectorMapping(
module_path="onyx.connectors.drupal_wiki.connector",
class_name="DrupalWikiConnector",
),
DocumentSource.IMAP: ConnectorMapping(
module_path="onyx.connectors.imap.connector",
class_name="ImapConnector",

BIN
web/public/DrupalWiki.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 480 KiB

View File

@@ -17,6 +17,9 @@ import {
} from "@/components/ui/fully_wrapped_tabs";
import { useFormikContext } from "formik";
// Define a general type for form values
type FormValues = Record<string, any>;
interface TabsFieldProps {
tabField: TabOption;
values: any;
@@ -30,6 +33,22 @@ const TabsField: FC<TabsFieldProps> = ({
connector,
currentCredential,
}) => {
const { setFieldValue } = useFormikContext<FormValues>(); // Initialize the tab value if not set
useEffect(() => {
if (!values[tabField.name] && tabField.tabs.length > 0) {
setFieldValue(
tabField.name,
tabField.defaultTab || tabField.tabs[0]?.value
);
}
}, [
tabField.name,
tabField.defaultTab,
tabField.tabs,
setFieldValue,
values,
]);
return (
<div className="w-full">
{tabField.label && (
@@ -54,16 +73,19 @@ const TabsField: FC<TabsFieldProps> = ({
<div className="text-sm text-muted-foreground">No tabs to display.</div>
) : (
<Tabs
defaultValue={tabField.tabs[0]?.value} // Optional chaining for safety, though the length check above handles it
value={values[tabField.name] || tabField.tabs[0]?.value} // Use current form value or default
className="w-full"
onValueChange={(newTab) => {
// Update the tab field value
setFieldValue(tabField.name, newTab);
// Clear values from other tabs but preserve defaults
tabField.tabs.forEach((tab) => {
if (tab.value !== newTab) {
tab.fields.forEach((field) => {
// Only clear if not default value
if (values[field.name] !== field.default) {
values[field.name] = field.default;
setFieldValue(field.name, field.default);
}
});
}
@@ -128,7 +150,7 @@ export const RenderField: FC<RenderFieldProps> = ({
connector,
currentCredential,
}) => {
const { setFieldValue } = useFormikContext<any>(); // Get Formik's context functions
const { setFieldValue } = useFormikContext<FormValues>(); // Get Formik's context functions
const label =
typeof field.label === "function"

View File

@@ -37,6 +37,7 @@ import discordIcon from "@public/discord.png";
import discourseIcon from "@public/Discourse.png";
import document360Icon from "@public/Document360.png";
import dropboxIcon from "@public/Dropbox.png";
import drupalwikiIcon from "@public/DrupalWiki.png";
import egnyteIcon from "@public/Egnyte.png";
import firefliesIcon from "@public/Fireflies.png";
import freshdeskIcon from "@public/Freshdesk.png";
@@ -834,6 +835,7 @@ export const DeepseekIcon = createLogoIcon(deepseekSVG);
export const DiscourseIcon = createLogoIcon(discourseIcon);
export const Document360Icon = createLogoIcon(document360Icon);
export const DropboxIcon = createLogoIcon(dropboxIcon);
export const DrupalWikiIcon = createLogoIcon(drupalwikiIcon);
export const EgnyteIcon = createLogoIcon(egnyteIcon);
export const FirefliesIcon = createLogoIcon(firefliesIcon);
export const FreshdeskIcon = createLogoIcon(freshdeskIcon);

View File

@@ -875,6 +875,79 @@ export const connectorConfigs: Record<
],
advanced_values: [],
},
drupal_wiki: {
description: "Configure Drupal Wiki connector",
values: [
{
type: "text",
query: "Enter the base URL of the Drupal Wiki instance:",
label: "Base URL",
name: "base_url",
optional: false,
description:
"The base URL of your Drupal Wiki instance (e.g., https://help.drupal-wiki.com )",
},
{
type: "tab",
name: "drupal_wiki_scope",
label: "What should we index from Drupal Wiki?",
optional: true,
tabs: [
{
value: "all_spaces",
label: "All Spaces",
fields: [
{
type: "string_tab",
label: "All Spaces",
name: "all_spaces_description",
description:
"This connector will index all spaces the provided credentials have access to!",
},
],
},
{
value: "specific_spaces",
label: "Specific Spaces and/or Pages",
fields: [
{
type: "list",
query: "Enter space IDs to include:",
label: "Space IDs",
name: "spaces",
description:
"Specify one or more space IDs to index. You can include spaces even if you also specify page IDs below. Only numeric values are allowed.",
optional: true,
transform: (values) =>
values.filter((value) => /^\d+$/.test(value.trim())),
},
{
type: "list",
query: "Enter page IDs to include:",
label: "Page IDs",
name: "pages",
description:
"Specify one or more page IDs to index. You can include specific pages even if you also specify spaces above. Only numeric values are allowed.",
optional: true,
transform: (values) =>
values.filter((value) => /^\d+$/.test(value.trim())),
},
],
},
],
},
{
type: "checkbox",
query: "Include attachments?",
label: "Include Attachments",
name: "include_attachments",
description:
"Enable processing of page attachments including images and documents",
default: false,
},
],
advanced_values: [],
},
axero: {
description: "Configure Axero connector",
values: [
@@ -1802,6 +1875,15 @@ export interface AxeroConfig {
spaces?: string[];
}
/** Connector configuration for the Drupal Wiki source (snake_case keys match the backend connector). */
export interface DrupalWikiConfig {
  /** Base URL of the Drupal Wiki instance. */
  base_url: string;
  /** Space ids (numeric strings) to index; used with the "specific_spaces" scope. */
  spaces?: string[];
  /** Page ids (numeric strings) to index; used with the "specific_spaces" scope. */
  pages?: string[];
  /** Index every accessible space — NOTE(review): presumably derived from the "all_spaces" scope tab; confirm. */
  include_all_spaces?: boolean;
  /** Selected scope tab value, e.g. "all_spaces" or "specific_spaces". */
  drupal_wiki_scope?: string;
  /** Whether page attachments (images and documents) are processed. */
  include_attachments?: boolean;
}
export interface TeamsConfig {
teams?: string[];
}

View File

@@ -266,6 +266,10 @@ export interface HighspotCredentialJson {
highspot_secret: string;
}
/** Credential payload for the Drupal Wiki connector. */
export interface DrupalWikiCredentialJson {
  /** Drupal Wiki personal access token. */
  drupal_wiki_api_token: string;
}
export interface ImapCredentialJson {
imap_username: string;
imap_password: string;
@@ -448,6 +452,9 @@ export const credentialTemplates: Record<ValidSources, any> = {
airtable: {
airtable_access_token: "",
} as AirtableCredentialJson,
drupal_wiki: {
drupal_wiki_api_token: "",
} as DrupalWikiCredentialJson,
xenforo: null,
google_sites: null,
file: null,
@@ -641,6 +648,9 @@ export const credentialDisplayNames: Record<string, string> = {
highspot_key: "Highspot Key",
highspot_secret: "Highspot Secret",
// Drupal Wiki
drupal_wiki_api_token: "Drupal Wiki Personal Access Token",
// Bitbucket
bitbucket_email: "Bitbucket Account Email",
bitbucket_api_token: "Bitbucket API Token",

View File

@@ -1,2 +1,2 @@
- Generated Files
* Generated files live here. This directory should be git ignored.
* Generated files live here. This directory should be git ignored.

View File

@@ -44,6 +44,7 @@ import {
AirtableIcon,
GitbookIcon,
HighspotIcon,
DrupalWikiIcon,
EmailIcon,
TestRailIcon,
} from "@/components/icons/icons";
@@ -300,6 +301,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Messaging,
docs: `${DOCS_ADMINS_PATH}/connectors/official/gmail/overview`,
},
drupal_wiki: {
icon: DrupalWikiIcon,
displayName: "Drupal Wiki",
category: SourceCategory.Wiki,
docs: `${DOCS_ADMINS_PATH}/connectors/official/drupal_wiki`,
},
imap: {
icon: EmailIcon,
displayName: "Email",

View File

@@ -501,6 +501,7 @@ export enum ValidSources {
Airtable = "airtable",
Gitbook = "gitbook",
Highspot = "highspot",
DrupalWiki = "drupal_wiki",
Imap = "imap",
Bitbucket = "bitbucket",
TestRail = "testrail",