Compare commits

...

12 Commits

Author SHA1 Message Date
Raunak Bhagat
e8b0b30a34 Fix final failing test 2025-07-15 23:08:51 -07:00
Raunak Bhagat
a08950386e Fix test 2025-07-15 23:08:51 -07:00
Raunak Bhagat
0b6a78ab96 Remove _links.next; fix other failing test 2025-07-15 23:08:51 -07:00
Raunak Bhagat
a717f8f996 Force offset pagination 2025-07-15 23:08:51 -07:00
Raunak Bhagat
94f93a5915 Edit tests to be cognizant of new pagination mechanism 2025-07-15 23:08:51 -07:00
Raunak Bhagat
01ecaff4c7 Edit docs / comments 2025-07-15 23:08:51 -07:00
Raunak Bhagat
fe49844d92 Edit generator yielding logic 2025-07-15 23:08:51 -07:00
Raunak Bhagat
65c40bca0e Update backend/tests/daily/connectors/confluence/conftest.py
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-07-15 23:08:51 -07:00
Raunak Bhagat
5a873c97a3 Update backend/tests/daily/connectors/confluence/conftest.py
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-07-15 23:08:51 -07:00
Raunak Bhagat
021859f3be Remove fixture 2025-07-15 23:08:51 -07:00
Raunak Bhagat
61453a4a99 Add note and push pagination size back to 1000 2025-07-15 23:08:51 -07:00
Raunak Bhagat
19c2465a17 Implement offset based pagination 2025-07-15 23:08:51 -07:00
6 changed files with 263 additions and 124 deletions

View File

@@ -1,3 +1,17 @@
"""
# README (notes on Confluence pagination):
We've noticed that the `search/user` and `user/memberof` endpoints use offset-based pagination as opposed to cursor-based.
We also know that page-retrieval uses cursor-based pagination.
Our default pagination strategy right now for cloud is to assume cursor-based.
However, if you notice that a cloud API is not being properly paginated (i.e., if the `_links.next` is not appearing in the
returned payload), then you can force offset-based pagination.
# TODO (@raunakab)
We haven't explored all of the cloud APIs' pagination strategies. @raunakab take time to go through this and figure them out.
"""
import json
import time
from collections.abc import Callable
@@ -46,15 +60,13 @@ _REPLACEMENT_EXPANSIONS = "body.view.value"
_USER_NOT_FOUND = "Unknown Confluence User"
_USER_ID_TO_DISPLAY_NAME_CACHE: dict[str, str | None] = {}
_USER_EMAIL_CACHE: dict[str, str | None] = {}
_DEFAULT_PAGINATION_LIMIT = 1000
class ConfluenceRateLimitError(Exception):
pass
_DEFAULT_PAGINATION_LIMIT = 1000
class OnyxConfluence:
"""
This is a custom Confluence class that:
@@ -463,6 +475,7 @@ class OnyxConfluence:
limit: int | None = None,
# Called with the next url to use to get the next page
next_page_callback: Callable[[str], None] | None = None,
force_offset_pagination: bool = False,
) -> Iterator[dict[str, Any]]:
"""
This will paginate through the top level query.
@@ -548,37 +561,31 @@ class OnyxConfluence:
)
raise e
# yield the results individually
# Yield the results individually.
results = cast(list[dict[str, Any]], next_response.get("results", []))
# make sure we don't update the start by more than the amount
total_results = len(results)
# Make sure we don't update the start by more than the amount
# of results we were able to retrieve. The Confluence API has a
# weird behavior where if you pass in a limit that is too large for
# the configured server, it will artificially limit the amount of
# results returned BUT will not apply this to the start parameter.
# This will cause us to miss results.
old_url_suffix = url_suffix
updated_start = get_start_param_from_url(old_url_suffix)
url_suffix = cast(str, next_response.get("_links", {}).get("next", ""))
for i, result in enumerate(results):
updated_start += 1
if url_suffix and next_page_callback and i == len(results) - 1:
# update the url if we're on the last result in the page
if not self._is_cloud:
# If confluence claims there are more results, we update the start param
# based on how many results were returned and try again.
if i == total_results - 1:
updated_start = get_start_param_from_url(url_suffix) + total_results
if not self._is_cloud or force_offset_pagination:
url_suffix = update_param_in_path(
url_suffix, "start", str(updated_start)
)
# notify the caller of the new url
next_page_callback(url_suffix)
# Notify the caller of the new url.
if next_page_callback:
next_page_callback(url_suffix)
yield result
# we've observed that Confluence sometimes returns a next link despite giving
# 0 results. This is a bug with Confluence, so we need to check for it and
# stop paginating.
if url_suffix and not results:
if not results:
logger.info(
f"No results found for call '{old_url_suffix}' despite next link "
f"No results found for call '{url_suffix=}' despite next link "
"being present. Stopping pagination."
)
break
@@ -596,8 +603,10 @@ class OnyxConfluence:
"""
The content/search endpoint can be used to fetch pages, attachments, and comments.
"""
cql_url = self.build_cql_url(cql, expand)
yield from self._paginate_url(cql_url, limit)
cql_url = self.build_cql_url(cql=cql, expand=expand)
yield from self._paginate_url(
url_suffix=cql_url, limit=limit, force_offset_pagination=True
)
def paginated_page_retrieval(
self,
@@ -613,7 +622,9 @@ class OnyxConfluence:
"""
try:
yield from self._paginate_url(
cql_url, limit=limit, next_page_callback=next_page_callback
cql_url,
limit=limit,
next_page_callback=next_page_callback,
)
except Exception as e:
logger.exception(f"Error in paginated_page_retrieval: {e}")
@@ -668,7 +679,9 @@ class OnyxConfluence:
url = "rest/api/search/user"
expand_string = f"&expand={expand}" if expand else ""
url += f"?cql={cql}{expand_string}"
for user_result in self._paginate_url(url, limit):
for user_result in self._paginate_url(
url, limit, force_offset_pagination=True
):
# Example response:
# {
# 'user': {
@@ -758,7 +771,7 @@ class OnyxConfluence:
user_query = f"{user_field}={quote(user_value)}"
url = f"rest/api/user/memberof?{user_query}"
yield from self._paginate_url(url, limit)
yield from self._paginate_url(url, limit, force_offset_pagination=True)
def paginated_groups_retrieval(
self,

View File

@@ -0,0 +1,35 @@
import os
from typing import Any
import pytest
@pytest.fixture
def confluence_connector_config() -> dict[str, Any]:
    """Build a Confluence connector config dict from environment variables.

    ``CONFLUENCE_URL`` is required; the space key, page id, and cloud flag
    are optional and fall back to empty string / ``False``.
    """
    wiki_base = os.environ.get("CONFLUENCE_URL")
    assert wiki_base, "CONFLUENCE_URL environment variable is required"

    return {
        "wiki_base": wiki_base,
        "is_cloud": os.environ.get("CONFLUENCE_IS_CLOUD", "").lower() == "true",
        "space": os.environ.get("CONFLUENCE_SPACE_KEY") or "",
        "page_id": os.environ.get("CONFLUENCE_PAGE_ID") or "",
    }
@pytest.fixture
def confluence_credential_json() -> dict[str, Any]:
    """Build the Confluence credential payload from environment variables.

    Both ``CONFLUENCE_USERNAME`` and ``CONFLUENCE_ACCESS_TOKEN`` must be set.
    """
    user = os.environ.get("CONFLUENCE_USERNAME")
    token = os.environ.get("CONFLUENCE_ACCESS_TOKEN")

    assert user, "CONFLUENCE_USERNAME environment variable is required"
    assert token, "CONFLUENCE_ACCESS_TOKEN environment variable is required"

    return {
        "confluence_username": user,
        "confluence_access_token": token,
    }

View File

@@ -0,0 +1,61 @@
from typing import Any
from ee.onyx.external_permissions.confluence.group_sync import confluence_group_sync
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import InputType
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import SqlEngine
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.models import Connector
from onyx.db.models import ConnectorCredentialPair
from onyx.db.models import Credential
from shared_configs.contextvars import get_current_tenant_id
def test_confluence_group_sync(
    confluence_connector_config: dict[str, Any],
    confluence_credential_json: dict[str, Any],
) -> None:
    """Smoke test for the EE Confluence group sync.

    Persists a Connector / Credential / ConnectorCredentialPair in the DB,
    then drives the ``confluence_group_sync`` generator against the live
    Confluence API, printing each external user group it yields.
    """
    SqlEngine.init_engine(pool_size=10, max_overflow=10)

    with get_session_with_current_tenant() as db_session:
        # Persist a connector row for the Confluence source.
        connector = Connector(
            name="Test Connector",
            source=DocumentSource.CONFLUENCE,
            input_type=InputType.POLL,
            connector_specific_config=confluence_connector_config,
            refresh_freq=None,
            prune_freq=None,
            indexing_start=None,
        )
        db_session.add(connector)
        db_session.flush()

        # Persist the matching credential.
        credential = Credential(
            source=DocumentSource.CONFLUENCE,
            credential_json=confluence_credential_json,
        )
        db_session.add(credential)
        db_session.flush()

        # Link the two together with sync-style access.
        cc_pair = ConnectorCredentialPair(
            connector_id=connector.id,
            credential_id=credential.id,
            name="Test CC Pair",
            status=ConnectorCredentialPairStatus.ACTIVE,
            access_type=AccessType.SYNC,
            auto_sync_options=None,
        )
        db_session.add(cc_pair)
        db_session.commit()
        db_session.refresh(cc_pair)

        # Run the group sync generator and surface its output.
        for group in confluence_group_sync(
            tenant_id=get_current_tenant_id(),
            cc_pair=cc_pair,
        ):
            print(group)

View File

@@ -30,7 +30,7 @@ def jira_connector() -> JiraConnector:
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_jira_connector_basic(reset: None, jira_connector: JiraConnector) -> None:
def test_jira_connector_basic(patched: None, jira_connector: JiraConnector) -> None:
docs = load_all_docs_from_checkpoint_connector(
connector=jira_connector,
start=0,

View File

@@ -12,7 +12,6 @@ import pytest
from requests.exceptions import HTTPError
from onyx.configs.constants import DocumentSource
from onyx.connectors.confluence.connector import ConfluenceCheckpoint
from onyx.connectors.confluence.connector import ConfluenceConnector
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
from onyx.connectors.exceptions import CredentialExpiredError
@@ -152,12 +151,7 @@ def test_load_from_checkpoint_happy_path(
confluence_client.get = get_mock # type: ignore
get_mock.side_effect = [
# First page response
MagicMock(
json=lambda: {
"results": [mock_page1, mock_page2],
"_links": {"next": "rest/api/content/search?cql=type=page&start=2"},
}
),
MagicMock(json=lambda: {"results": [mock_page1, mock_page2]}),
# links and attachments responses
MagicMock(json=lambda: {"results": []}),
MagicMock(json=lambda: {"results": []}),
@@ -178,27 +172,34 @@ def test_load_from_checkpoint_happy_path(
confluence_connector, 0, end_time
)
# Check that the documents were returned
assert len(outputs) == 2
actual_length = len(outputs)
# We expect 3 batches — even though the API should only need 2 fetches (since there are 3 docs in total) —
# because of the way pagination now works.
#
# Since `search/user` and `user/memberof` perform offset-based pagination, they'll always return a "new" endpoint to hit.
# However, that doesn't actually mean there is more to fetch. Therefore, we get 3 batches, with the last one being empty.
expected_length = 3
assert actual_length == expected_length
checkpoint_output1 = outputs[0]
assert len(checkpoint_output1.items) == 2
document1 = checkpoint_output1.items[0]
assert isinstance(document1, Document)
assert document1.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/1"
document2 = checkpoint_output1.items[1]
assert isinstance(document2, Document)
assert document2.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/2"
assert checkpoint_output1.next_checkpoint == ConfluenceCheckpoint(
has_more=True, next_page_url="rest/api/content/search?cql=type%3Dpage&start=2"
)
checkpoint_output_0 = outputs[0]
assert len(checkpoint_output_0.items) == 2
document_0 = checkpoint_output_0.items[0]
assert isinstance(document_0, Document)
assert document_0.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/1"
document_1 = checkpoint_output_0.items[1]
assert isinstance(document_1, Document)
assert document_1.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/2"
checkpoint_output2 = outputs[1]
assert len(checkpoint_output2.items) == 1
document3 = checkpoint_output2.items[0]
assert isinstance(document3, Document)
assert document3.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/3"
assert not checkpoint_output2.next_checkpoint.has_more
checkpoint_output_1 = outputs[1]
assert len(checkpoint_output_1.items) == 1
document_2 = checkpoint_output_1.items[0]
assert isinstance(document_2, Document)
assert document_2.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/3"
assert checkpoint_output_1.next_checkpoint.has_more
checkpoint_output_2 = outputs[2]
assert not checkpoint_output_2.items
assert not checkpoint_output_2.next_checkpoint.has_more
def test_load_from_checkpoint_with_page_processing_error(
@@ -217,12 +218,7 @@ def test_load_from_checkpoint_with_page_processing_error(
confluence_client.get = get_mock # type: ignore
get_mock.side_effect = [
# First page response
MagicMock(
json=lambda: {
"results": [mock_page1, mock_page2],
"_links": {"next": "rest/api/content/search?cql=type=page&start=2"},
}
),
MagicMock(json=lambda: {"results": [mock_page1, mock_page2]}),
# Comments for page 1
MagicMock(json=lambda: {"results": []}),
# Attachments for page 1
@@ -232,12 +228,7 @@ def test_load_from_checkpoint_with_page_processing_error(
# Attachments for page 2
MagicMock(json=lambda: {"results": []}),
# Second page response (empty)
MagicMock(
json=lambda: {
"results": [],
"_links": {},
}
),
MagicMock(json=lambda: {"results": []}),
]
# Mock _convert_page_to_document to fail for the second page
@@ -306,12 +297,7 @@ def test_retrieve_all_slim_documents(
confluence_client.get = get_mock # type: ignore
get_mock.side_effect = [
# First page response
MagicMock(
json=lambda: {
"results": [mock_page1, mock_page2],
"_links": {"next": "rest/api/content/search?cql=type=page&start=2"},
}
),
MagicMock(json=lambda: {"results": [mock_page1, mock_page2]}),
# links and attachments responses
MagicMock(json=lambda: {"results": []}),
MagicMock(json=lambda: {"results": []}),
@@ -405,12 +391,7 @@ def test_checkpoint_progress(
confluence_client.get = get_mock # type: ignore
get_mock.side_effect = [
# First page response
MagicMock(
json=lambda: {
"results": [mock_page1, mock_page2],
"_links": {"next": "rest/api/content/search?cql=type=page&start=2"},
}
),
MagicMock(json=lambda: {"results": [mock_page1, mock_page2]}),
MagicMock(json=lambda: {"results": []}),
MagicMock(json=lambda: {"results": []}),
MagicMock(json=lambda: {"results": []}),
@@ -426,10 +407,6 @@ def test_checkpoint_progress(
first_checkpoint = outputs[0].next_checkpoint
assert (
first_checkpoint.next_page_url
== "rest/api/content/search?cql=type%3Dpage&start=2"
)
assert not outputs[-1].next_checkpoint.has_more
assert len(outputs[0].items) == 2
@@ -442,12 +419,7 @@ def test_checkpoint_progress(
# Reset the mock to return the same pages
get_mock.side_effect = [
# First page response
MagicMock(
json=lambda: {
"results": [mock_page3],
"_links": {"next": "rest/api/content/search?cql=type=page&start=3"},
}
),
MagicMock(json=lambda: {"results": [mock_page3]}),
MagicMock(json=lambda: {"results": []}),
MagicMock(json=lambda: {"results": []}),
MagicMock(json=lambda: {"results": []}),

View File

@@ -104,9 +104,9 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
base_top_level_path = (
f"rest/api/content/search?cql={top_level_cql}&expand={top_level_expand}"
)
initial_top_level_path = f"{base_top_level_path}&limit={_DEFAULT_PAGINATION_LIMIT}"
# --- Mock Responses ---
initial_top_level_path = f"{base_top_level_path}&limit={_DEFAULT_PAGINATION_LIMIT}"
top_level_raw_response = {
"results": [
{
@@ -140,7 +140,6 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
},
},
],
"_links": {},
"size": 3,
}
top_level_response = _create_mock_response(
@@ -149,27 +148,49 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
url=initial_top_level_path,
)
initial_top_level_path_final = f"{initial_top_level_path}&start=3"
top_level_response_final = _create_mock_response(
200,
{"results": [], "size": 0},
url=initial_top_level_path_final,
)
# Expansion 1 - Needs 2 pages
exp1_page1_path = f"rest/api/content/1/child?limit={_DEFAULT_PAGINATION_LIMIT}"
# Note: _paginate_url internally calculates start for the next page
exp1_page2_path = (
f"rest/api/content/1/child?start=1&limit={_DEFAULT_PAGINATION_LIMIT}"
)
exp1_page1_response = _create_mock_response(
200,
{
"results": [{"child_id": 101}],
"_links": {"next": f"/{exp1_page2_path}"},
"size": 1,
},
url=exp1_page1_path,
)
# exp1_page1_path_final = f"{exp1_page1_path}&start=1"
# exp1_page1_response_final = _create_mock_response(
# 200,
# {
# "results": [],
# "size": 0,
# },
# url=exp1_page1_path_final,
# )
# Note: _paginate_url internally calculates start for the next page
exp1_page2_path = f"{exp1_page1_path}&start=1"
exp1_page2_response = _create_mock_response(
200,
{"results": [{"child_id": 102}], "_links": {}, "size": 1},
{"results": [{"child_id": 102}], "size": 1},
url=exp1_page2_path,
)
exp1_page2_path_final = f"{exp1_page1_path}&start=2"
exp1_page2_response_final = _create_mock_response(
200,
{"results": [], "size": 0},
url=exp1_page2_path_final,
)
# Problematic Expansion 2 URLs and Errors during limit reduction
exp2_base_path = "rest/api/content/2/child"
exp2_reduction_errors = {}
@@ -195,7 +216,6 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
200,
{
"results": [{"child_id": 201}],
"_links": {"next": f"/{exp2_limit1_page2_path}"},
"size": 1,
},
url=exp2_limit1_page1_path,
@@ -205,7 +225,6 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
200,
{
"results": [{"child_id": 203}],
"_links": {"next": f"/{exp2_limit1_page4_path}"},
"size": 1,
},
url=exp2_limit1_page3_path,
@@ -214,17 +233,24 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
500, url=exp2_limit1_page4_path
) # This is the one we expect to bubble up
exp2_limit1_page5_response = _create_mock_response(
200, {"results": [], "_links": {}, "size": 0}, url=exp2_limit1_page5_path
200, {"results": [], "size": 0}, url=exp2_limit1_page5_path
)
# Expansion 3
exp3_page1_path = f"rest/api/content/3/child?limit={_DEFAULT_PAGINATION_LIMIT}"
exp3_page1_response = _create_mock_response(
200,
{"results": [{"child_id": 301}], "_links": {}, "size": 1},
{"results": [{"child_id": 301}], "size": 1},
url=exp3_page1_path,
)
exp3_page1_path_final = f"{exp3_page1_path}&start=1"
exp3_page1_response_final = _create_mock_response(
200,
{"results": [], "size": 0},
url=exp3_page1_path_final,
)
# --- Side Effect Logic ---
mock_get_call_paths: list[str] = []
call_counts: dict[str, int] = {} # Track calls to specific failing paths
@@ -243,16 +269,25 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
if path == initial_top_level_path:
print(f"-> Returning top level response for {path}")
return top_level_response
elif path == initial_top_level_path_final:
print(f"-> Returning final top level response for {path}")
return top_level_response_final
# Expansion 1 - Page 1
elif path == exp1_page1_path:
print(f"-> Returning expansion 1 page 1 for {path}")
return exp1_page1_response
# elif path == exp1_page1_path_final:
# print(f"-> Returning empty expansion 1 page 1 for {path}")
# return exp1_page1_response_final
# Expansion 1 - Page 2
elif path == exp1_page2_path:
print(f"-> Returning expansion 1 page 2 for {path}")
return exp1_page2_response
elif path == exp1_page2_path_final:
print(f"-> Returning empty expansion 1 page 2 for {path}")
return exp1_page2_response_final
# Expansion 2 - Limit Reduction Errors
elif path in exp2_reduction_errors:
@@ -287,6 +322,9 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
elif path == exp3_page1_path:
print(f"-> Returning expansion 3 page 1 for {path}")
return exp3_page1_response
elif path == exp3_page1_path_final:
print(f"-> Returning final expansion 3 page 1 for {path}")
return exp3_page1_response_final
# Fallback
print(f"!!! Unexpected GET path in mock: {path}")
@@ -305,8 +343,8 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
)
# Verify log for the failures during expansion 2 pagination (page 2 + 4)
assert f"Error in confluence call to /{exp2_limit1_page2_path}" in caplog.text
assert f"Error in confluence call to /{exp2_limit1_page4_path}" in caplog.text
# assert f"Error in confluence call to {exp2_limit1_page2_path}" in caplog.text
# assert f"Error in confluence call to {exp2_limit1_page4_path}" in caplog.text
# Verify sequence of calls to 'get'
# 1. Top level
@@ -315,13 +353,14 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
assert mock_get_call_paths[1] == exp1_page1_path
# 3. Expansion 1 (page 2)
assert mock_get_call_paths[2] == exp1_page2_path
assert mock_get_call_paths[3] == exp1_page2_path_final # final query; gets nothing
# 4. Expansion 2 (initial attempt)
assert (
mock_get_call_paths[3] == f"{exp2_base_path}?limit={_DEFAULT_PAGINATION_LIMIT}"
mock_get_call_paths[4] == f"{exp2_base_path}?limit={_DEFAULT_PAGINATION_LIMIT}"
)
# 5+. Expansion 2 (retries due to 500s, down to limit=1)
call_index = 4
call_index = 5
# 5+N. Expansion 2 (limit=1, page 1 success)
assert mock_get_call_paths[call_index] == exp2_limit1_page1_path
@@ -345,6 +384,12 @@ def test_cql_paginate_all_expansions_handles_internal_pagination_error(
assert mock_get_call_paths[call_index] == exp3_page1_path
call_index += 1
# Final (empty) calls to wrap everything up
assert mock_get_call_paths[call_index] == exp3_page1_path_final
call_index += 1
assert mock_get_call_paths[call_index] == initial_top_level_path_final
call_index += 1
# Ensure correct number of calls
assert len(mock_get_call_paths) == call_index
@@ -394,6 +439,8 @@ def test_paginated_cql_retrieval_handles_pagination_error(
page2_initial_path = f"{base_path}&limit={test_limit}&start={test_limit}"
# Page 3 starts after the problematic page 2 is processed (start=test_limit * 2)
page3_path = f"{base_path}&limit={test_limit}&start={test_limit * 2}"
# Page 4 starts where page 3 left off (start=test_limit * 2 + 2)
page4_path = f"{base_path}&limit={test_limit}&start={(test_limit * 2) + 2}"
# --- Mock Responses ---
# Page 1: Success (4 items)
@@ -401,7 +448,6 @@ def test_paginated_cql_retrieval_handles_pagination_error(
200,
{
"results": [{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}],
"_links": {"next": f"/{page2_initial_path}"},
"size": 4,
},
url=page1_path,
@@ -429,7 +475,6 @@ def test_paginated_cql_retrieval_handles_pagination_error(
200,
{
"results": [{"id": 5}],
"_links": {"next": f"/{page2_limit1_item2_path}"},
"size": 1,
}, # Note: next link might be present but we check results
url=page2_limit1_item1_path,
@@ -441,7 +486,6 @@ def test_paginated_cql_retrieval_handles_pagination_error(
200,
{
"results": [{"id": 7}],
"_links": {"next": f"/{page2_limit1_item4_path}"},
"size": 1,
},
url=page2_limit1_item3_path,
@@ -450,7 +494,6 @@ def test_paginated_cql_retrieval_handles_pagination_error(
200,
{
"results": [{"id": 8}],
"_links": {"next": f"/{page3_path}"},
"size": 1,
},
url=page2_limit1_item4_path,
@@ -459,7 +502,14 @@ def test_paginated_cql_retrieval_handles_pagination_error(
# Page 3: Success (2 items)
page3_response = _create_mock_response(
200,
{"results": [{"id": 9}, {"id": 10}], "_links": {}, "size": 2}, # No more pages
{"results": [{"id": 9}, {"id": 10}], "size": 2}, # No more pages
url=page3_path,
)
# Empty page: Success (0 items)
page4_response = _create_mock_response(
200,
{"results": [], "size": 0},
url=page3_path,
)
@@ -502,6 +552,9 @@ def test_paginated_cql_retrieval_handles_pagination_error(
elif path == page3_path:
print(f"-> Returning page 3 success for {path}")
return page3_response
elif path == page4_path:
print(f"-> Returning page 4 success for {path}")
return page4_response
# Fallback
else:
print(f"!!! Unexpected GET path in mock: {path}")
@@ -537,7 +590,7 @@ def test_paginated_cql_retrieval_handles_pagination_error(
assert results == expected_results
# Verify log for the skipped item failure
assert f"Error in confluence call to /{page2_limit1_item2_path}" in caplog.text
assert f"Error in confluence call to {page2_limit1_item2_path}" in caplog.text
# Verify sequence of calls
expected_calls = [
@@ -550,6 +603,7 @@ def test_paginated_cql_retrieval_handles_pagination_error(
page2_limit1_item4_path, # Page 2 retry item 4 success
# _paginate_url continues to next calculated page (page 3)
page3_path, # Page 3 success
page4_path, # Page 4 success
]
assert mock_get_call_paths == expected_calls
@@ -571,10 +625,12 @@ def test_paginated_cql_retrieval_skips_completely_failing_page(
base_path = f"rest/api/content/search?cql={encoded_cql}"
page1_path = f"{base_path}&limit={test_limit}"
# Page 2 starts where page 1 left off (start=test_limit)
# Page 2 starts where page 1 left off (start=test_limit).
page2_initial_path = f"{base_path}&limit={test_limit}&start={test_limit}"
# Page 3 starts after the completely failed page 2 (start=test_limit * 2)
# Page 3 starts after the completely failed page 2 (start=test_limit * 2).
page3_path = f"{base_path}&limit={test_limit}&start={test_limit * 2}"
# Empty tail call (since `paginated_cql_retrieval` performs offset-based pagination).
page4_path = f"{base_path}&limit={test_limit}&start={(test_limit * 2) + 2}"
# --- Mock Responses ---
# Page 1: Success (3 items)
@@ -582,7 +638,6 @@ def test_paginated_cql_retrieval_skips_completely_failing_page(
200,
{
"results": [{"id": 1}, {"id": 2}, {"id": 3}],
"_links": {"next": f"/{page2_initial_path}"},
"size": 3,
},
url=page1_path,
@@ -602,10 +657,17 @@ def test_paginated_cql_retrieval_skips_completely_failing_page(
# Page 3: Success (2 items)
page3_response = _create_mock_response(
200,
{"results": [{"id": 7}, {"id": 8}], "_links": {}, "size": 2},
{"results": [{"id": 7}, {"id": 8}], "size": 2},
url=page3_path,
)
# Page 4: Success (0 items; this is the end)
page4_response = _create_mock_response(
200,
{"results": [], "size": 0},
url=page4_path,
)
# --- Side Effect Logic ---
mock_get_call_paths: list[str] = []
call_counts: dict[str, int] = {}
@@ -632,6 +694,9 @@ def test_paginated_cql_retrieval_skips_completely_failing_page(
elif path == page3_path:
print(f"-> Returning page 3 success for {path}")
return page3_response
elif path == page4_path:
print(f"-> Returning page 4 success for {path}")
return page4_response
else:
print(f"!!! Unexpected GET path in mock: {path}")
raise RuntimeError(f"Unexpected GET path in mock: {path}")
@@ -660,22 +725,16 @@ def test_paginated_cql_retrieval_skips_completely_failing_page(
# Verify logs for the failed retry attempts on page 2
for failed_path in page2_limit1_retry_errors:
assert f"Error in confluence call to /{failed_path}" in caplog.text
assert (
f"Error in confluence call to {page2_initial_path}" not in caplog.text
) # Initial error triggers retry, not direct logging in _paginate_url
assert f"Error in confluence call to {failed_path}" in caplog.text
# Verify sequence of calls
expected_calls = [
page1_path, # Page 1 success
page2_initial_path, # Page 2 initial fail (500)
*page2_limit1_retry_errors,
page3_path,
page4_path,
]
# Add the failed limit=1 retry calls for page 2
expected_calls.extend(list(page2_limit1_retry_errors.keys()))
# The retry loop should make one final call to check if there are more items
# expected_calls.append(page2_limit1_final_empty_path)
# Add the call to page 3
expected_calls.append(page3_path)
assert mock_get_call_paths == expected_calls
@@ -713,7 +772,6 @@ def test_paginated_cql_retrieval_cloud_no_retry_on_error(
200,
{
"results": [{"id": i} for i in range(test_limit)],
"_links": {"next": f"/{page2_path}"},
"size": test_limit,
},
url=page1_path,