fix typing

address nits
k
2026-02-18 16:25:45 +00:00 · 2025-03-30 16:46:55 -07:00 · 2025-03-30 14:43:46 -07:00 · 2025-03-30 13:45:38 -07:00 · 2025-03-30 13:41:46 -07:00 · 2025-03-30 13:40:40 -07:00
51 changed files with 120 additions and 1120 deletions
--- a/backend/alembic/versions/8e1ac4f39a9f_enable_contextual_retrieval.py
+++ b/backend/alembic/versions/8e1ac4f39a9f_enable_contextual_retrieval.py
@@ -1,50 +0,0 @@
-"""enable contextual retrieval
-
-Revision ID: 8e1ac4f39a9f
-Revises: 3781a5eb12cb
-Create Date: 2024-12-20 13:29:09.918661
-
-"""
-from alembic import op
-import sqlalchemy as sa
-
-
-# revision identifiers, used by Alembic.
-revision = "8e1ac4f39a9f"
-down_revision = "3781a5eb12cb"
-branch_labels = None
-depends_on = None
-
-
-def upgrade() -> None:
-    op.add_column(
-        "search_settings",
-        sa.Column(
-            "enable_contextual_rag",
-            sa.Boolean(),
-            nullable=False,
-            server_default="false",
-        ),
-    )
-    op.add_column(
-        "search_settings",
-        sa.Column(
-            "contextual_rag_llm_name",
-            sa.String(),
-            nullable=True,
-        ),
-    )
-    op.add_column(
-        "search_settings",
-        sa.Column(
-            "contextual_rag_llm_provider",
-            sa.String(),
-            nullable=True,
-        ),
-    )
-
-
-def downgrade() -> None:
-    op.drop_column("search_settings", "enable_contextual_rag")
-    op.drop_column("search_settings", "contextual_rag_llm_name")
-    op.drop_column("search_settings", "contextual_rag_llm_provider")
--- a/backend/ee/onyx/server/tenants/provisioning.py
+++ b/backend/ee/onyx/server/tenants/provisioning.py
@@ -94,7 +94,6 @@ async def get_or_provision_tenant(
        # Notify control plane if we have created / assigned a new tenant
        if not DEV_MODE:
            await notify_control_plane(tenant_id, email, referral_source)
-
        return tenant_id

    except Exception as e:
--- a/backend/onyx/auth/users.py
+++ b/backend/onyx/auth/users.py
@@ -56,6 +56,7 @@ from httpx_oauth.oauth2 import OAuth2Token
 from pydantic import BaseModel
 from sqlalchemy.ext.asyncio import AsyncSession

+from ee.onyx.configs.app_configs import ANONYMOUS_USER_COOKIE_NAME
 from onyx.auth.api_key import get_hashed_api_key_from_request
 from onyx.auth.email_utils import send_forgot_password_email
 from onyx.auth.email_utils import send_user_verification_email
@@ -360,8 +361,18 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                reason="Password must contain at least one special character from the following set: "
                f"{PASSWORD_SPECIAL_CHARS}."
            )
+
        return

+    async def on_after_login(
+        self,
+        user: User,
+        request: Optional[Request] = None,
+        response: Optional[Response] = None,
+    ) -> None:
+        if response:
+            response.delete_cookie(ANONYMOUS_USER_COOKIE_NAME)
+
    async def oauth_callback(
        self,
        oauth_name: str,
--- a/backend/onyx/chat/prune_and_merge.py
+++ b/backend/onyx/chat/prune_and_merge.py
@@ -153,8 +153,6 @@ def _apply_pruning(
    # remove docs that are explicitly marked as not for QA
    sections = _remove_sections_to_ignore(sections=sections)

-    section_idx_token_count: dict[int, int] = {}
-
    final_section_ind = None
    total_tokens = 0
    for ind, section in enumerate(sections):
@@ -204,20 +202,10 @@ def _apply_pruning(
            section_token_count = DOC_EMBEDDING_CONTEXT_SIZE

        total_tokens += section_token_count
-        section_idx_token_count[ind] = section_token_count
-
        if total_tokens > token_limit:
            final_section_ind = ind
            break

-    try:
-        logger.debug(f"Number of documents after pruning: {ind + 1}")
-        logger.debug("Number of tokens per document (pruned):")
-        for x, y in section_idx_token_count.items():
-            logger.debug(f"{x + 1}: {y}")
-    except Exception as e:
-        logger.error(f"Error logging prune statistics: {e}")
-
    if final_section_ind is not None:
        if is_manually_selected_docs or use_sections:
            if final_section_ind != len(sections) - 1:
@@ -374,26 +362,6 @@ def _merge_sections(sections: list[InferenceSection]) -> list[InferenceSection]:
        reverse=True,
    )

-    try:
-        num_original_sections = len(sections)
-        num_original_document_ids = len(
-            set([section.center_chunk.document_id for section in sections])
-        )
-        num_merged_sections = len(new_sections)
-        num_merged_document_ids = len(
-            set([section.center_chunk.document_id for section in new_sections])
-        )
-        logger.debug(
-            f"Merged {num_original_sections} sections from {num_original_document_ids} documents "
-            f"into {num_merged_sections} new sections in {num_merged_document_ids} documents"
-        )
-
-        logger.debug("Number of chunks per document (new ranking):")
-        for x, y in enumerate(new_sections):
-            logger.debug(f"{x + 1}: {len(y.chunks)}")
-    except Exception as e:
-        logger.error(f"Error logging merge statistics: {e}")
-
    return new_sections


--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -495,11 +495,6 @@ NUM_SECONDARY_INDEXING_WORKERS = int(
 ENABLE_MULTIPASS_INDEXING = (
    os.environ.get("ENABLE_MULTIPASS_INDEXING", "").lower() == "true"
 )
-# Enable contextual retrieval
-ENABLE_CONTEXTUAL_RAG = os.environ.get("ENABLE_CONTEXTUAL_RAG", "").lower() == "true"
-
-DEFAULT_CONTEXTUAL_RAG_LLM_NAME = "gpt-4o-mini"
-DEFAULT_CONTEXTUAL_RAG_LLM_PROVIDER = "DevEnvPresetOpenAI"
 # Finer grained chunking for more detail retention
 # Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE
 # tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end
@@ -541,17 +536,6 @@ MAX_FILE_SIZE_BYTES = int(
    os.environ.get("MAX_FILE_SIZE_BYTES") or 2 * 1024 * 1024 * 1024
 )  # 2GB in bytes

-# Use document summary for contextual rag
-USE_DOCUMENT_SUMMARY = os.environ.get("USE_DOCUMENT_SUMMARY", "true").lower() == "true"
-# Use chunk summary for contextual rag
-USE_CHUNK_SUMMARY = os.environ.get("USE_CHUNK_SUMMARY", "true").lower() == "true"
-# Average summary embeddings for contextual rag (not yet implemented)
-AVERAGE_SUMMARY_EMBEDDINGS = (
-    os.environ.get("AVERAGE_SUMMARY_EMBEDDINGS", "false").lower() == "true"
-)
-
-MAX_TOKENS_FOR_FULL_INCLUSION = 4096
-
 #####
 # Miscellaneous
 #####
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -30,7 +30,6 @@ from onyx.file_processing.file_validation import is_valid_image_type
 from onyx.file_processing.image_summarization import summarize_image_with_error_handling
 from onyx.file_processing.image_utils import store_image_and_create_section
 from onyx.llm.interfaces import LLM
-from onyx.utils.lazy import lazy_eval
 from onyx.utils.logger import setup_logger

 logger = setup_logger()
@@ -77,26 +76,6 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
    return is_valid_image_type(mime_type)


-def download_request(service: GoogleDriveService, file_id: str) -> bytes:
-    """
-    Download the file from Google Drive.
-    """
-    # For other file types, download the file
-    # Use the correct API call for downloading files
-    request = service.files().get_media(fileId=file_id)
-    response_bytes = io.BytesIO()
-    downloader = MediaIoBaseDownload(response_bytes, request)
-    done = False
-    while not done:
-        _, done = downloader.next_chunk()
-
-    response = response_bytes.getvalue()
-    if not response:
-        logger.warning(f"Failed to download {file_id}")
-        return bytes()
-    return response
-
-
 def _download_and_extract_sections_basic(
    file: dict[str, str],
    service: GoogleDriveService,
@@ -135,31 +114,41 @@ def _download_and_extract_sections_basic(

    # For other file types, download the file
    # Use the correct API call for downloading files
-    response_call = lazy_eval(lambda: download_request(service, file_id))
+    request = service.files().get_media(fileId=file_id)
+    response_bytes = io.BytesIO()
+    downloader = MediaIoBaseDownload(response_bytes, request)
+    done = False
+    while not done:
+        _, done = downloader.next_chunk()
+
+    response = response_bytes.getvalue()
+    if not response:
+        logger.warning(f"Failed to download {file_name}")
+        return []

    # Process based on mime type
    if mime_type == "text/plain":
-        text = response_call().decode("utf-8")
+        text = response.decode("utf-8")
        return [TextSection(link=link, text=text)]

    elif (
        mime_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
-        text, _ = docx_to_text_and_images(io.BytesIO(response_call()))
+        text, _ = docx_to_text_and_images(io.BytesIO(response))
        return [TextSection(link=link, text=text)]

    elif (
        mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ):
-        text = xlsx_to_text(io.BytesIO(response_call()))
+        text = xlsx_to_text(io.BytesIO(response))
        return [TextSection(link=link, text=text)]

    elif (
        mime_type
        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ):
-        text = pptx_to_text(io.BytesIO(response_call()))
+        text = pptx_to_text(io.BytesIO(response))
        return [TextSection(link=link, text=text)]

    elif is_gdrive_image_mime_type(mime_type):
@@ -169,7 +158,7 @@ def _download_and_extract_sections_basic(
            with get_session_with_current_tenant() as db_session:
                section, embedded_id = store_image_and_create_section(
                    db_session=db_session,
-                    image_data=response_call(),
+                    image_data=response,
                    file_name=file_id,
                    display_name=file_name,
                    media_type=mime_type,
@@ -182,7 +171,7 @@ def _download_and_extract_sections_basic(
        return sections

    elif mime_type == "application/pdf":
-        text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_call()))
+        text, _pdf_meta, images = read_pdf_file(io.BytesIO(response))
        pdf_sections: list[TextSection | ImageSection] = [
            TextSection(link=link, text=text)
        ]
@@ -205,15 +194,8 @@ def _download_and_extract_sections_basic(

    else:
        # For unsupported file types, try to extract text
-        if mime_type in [
-            "application/vnd.google-apps.video",
-            "application/vnd.google-apps.audio",
-            "application/zip",
-        ]:
-            return []
-        # For unsupported file types, try to extract text
        try:
-            text = extract_file_text(io.BytesIO(response_call()), file_name)
+            text = extract_file_text(io.BytesIO(response), file_name)
            return [TextSection(link=link, text=text)]
        except Exception as e:
            logger.warning(f"Failed to extract text from {file_name}: {e}")
--- a/backend/onyx/connectors/models.py
+++ b/backend/onyx/connectors/models.py
@@ -163,9 +163,6 @@ class DocumentBase(BaseModel):
                attributes.append(k + INDEX_SEPARATOR + v)
        return attributes

-    def get_text_content(self) -> str:
-        return " ".join([section.text for section in self.sections if section.text])
-

 class Document(DocumentBase):
    """Used for Onyx ingestion api, the ID is required"""
--- a/backend/onyx/context/search/models.py
+++ b/backend/onyx/context/search/models.py
@@ -60,7 +60,7 @@ class SearchSettingsCreationRequest(InferenceSettings, IndexingSetting):
        inference_settings = InferenceSettings.from_db_model(search_settings)
        indexing_setting = IndexingSetting.from_db_model(search_settings)

-        return cls(**inference_settings.model_dump(), **indexing_setting.model_dump())
+        return cls(**inference_settings.dict(), **indexing_setting.dict())


 class SavedSearchSettings(InferenceSettings, IndexingSetting):
@@ -80,9 +80,6 @@ class SavedSearchSettings(InferenceSettings, IndexingSetting):
            reduced_dimension=search_settings.reduced_dimension,
            # Whether switching to this model requires re-indexing
            background_reindex_enabled=search_settings.background_reindex_enabled,
-            enable_contextual_rag=search_settings.enable_contextual_rag,
-            contextual_rag_llm_name=search_settings.contextual_rag_llm_name,
-            contextual_rag_llm_provider=search_settings.contextual_rag_llm_provider,
            # Reranking Details
            rerank_model_name=search_settings.rerank_model_name,
            rerank_provider_type=search_settings.rerank_provider_type,
@@ -221,8 +218,6 @@ class InferenceChunk(BaseChunk):
    # to specify that a set of words should be highlighted. For example:
    # ["<hi>the</hi> <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]
    match_highlights: list[str]
-    doc_summary: str
-    chunk_context: str

    # when the doc was last updated
    updated_at: datetime | None
--- a/backend/onyx/context/search/postprocessing/postprocessing.py
+++ b/backend/onyx/context/search/postprocessing/postprocessing.py
@@ -196,21 +196,9 @@ def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk
            RETURN_SEPARATOR
        )

-    def _remove_contextual_rag(chunk: InferenceChunkUncleaned) -> str:
-        # remove document summary
-        if chunk.content.startswith(chunk.doc_summary):
-            chunk.content = chunk.content[len(chunk.doc_summary) :].lstrip()
-        # remove chunk context
-        if chunk.content.endswith(chunk.chunk_context):
-            chunk.content = chunk.content[
-                : len(chunk.content) - len(chunk.chunk_context)
-            ].rstrip()
-        return chunk.content
-
    for chunk in chunks:
        chunk.content = _remove_title(chunk)
        chunk.content = _remove_metadata_suffix(chunk)
-        chunk.content = _remove_contextual_rag(chunk)

    return [chunk.to_inference_chunk() for chunk in chunks]

--- a/backend/onyx/db/models.py
+++ b/backend/onyx/db/models.py
@@ -791,15 +791,6 @@ class SearchSettings(Base):
    # Mini and Large Chunks (large chunk also checks for model max context)
    multipass_indexing: Mapped[bool] = mapped_column(Boolean, default=True)

-    # Contextual RAG
-    enable_contextual_rag: Mapped[bool] = mapped_column(Boolean, default=False)
-
-    # Contextual RAG LLM
-    contextual_rag_llm_name: Mapped[str | None] = mapped_column(String, nullable=True)
-    contextual_rag_llm_provider: Mapped[str | None] = mapped_column(
-        String, nullable=True
-    )
-
    multilingual_expansion: Mapped[list[str]] = mapped_column(
        postgresql.ARRAY(String), default=[]
    )
--- a/backend/onyx/db/search_settings.py
+++ b/backend/onyx/db/search_settings.py
@@ -62,9 +62,6 @@ def create_search_settings(
        multipass_indexing=search_settings.multipass_indexing,
        embedding_precision=search_settings.embedding_precision,
        reduced_dimension=search_settings.reduced_dimension,
-        enable_contextual_rag=search_settings.enable_contextual_rag,
-        contextual_rag_llm_name=search_settings.contextual_rag_llm_name,
-        contextual_rag_llm_provider=search_settings.contextual_rag_llm_provider,
        multilingual_expansion=search_settings.multilingual_expansion,
        disable_rerank_for_streaming=search_settings.disable_rerank_for_streaming,
        rerank_model_name=search_settings.rerank_model_name,
@@ -322,7 +319,6 @@ def get_old_default_embedding_model() -> IndexingSetting:
        passage_prefix=(ASYM_PASSAGE_PREFIX if is_overridden else ""),
        index_name="danswer_chunk",
        multipass_indexing=False,
-        enable_contextual_rag=False,
        api_url=None,
    )

@@ -337,6 +333,5 @@ def get_new_default_embedding_model() -> IndexingSetting:
        passage_prefix=ASYM_PASSAGE_PREFIX,
        index_name=f"danswer_chunk_{clean_model_name(DOCUMENT_ENCODER_MODEL)}",
        multipass_indexing=False,
-        enable_contextual_rag=False,
        api_url=None,
    )
--- a/backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -98,12 +98,6 @@ schema DANSWER_CHUNK_NAME {
        field metadata type string {
            indexing: summary | attribute
        }
-        field chunk_context type string {
-            indexing: summary | attribute
-        }
-        field doc_summary type string {
-            indexing: summary | attribute
-        }
        field metadata_suffix type string {
            indexing: summary | attribute
        }
--- a/backend/onyx/document_index/vespa/chunk_retrieval.py
+++ b/backend/onyx/document_index/vespa/chunk_retrieval.py
@@ -24,11 +24,9 @@ from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
 from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
 from onyx.document_index.vespa_constants import BLURB
 from onyx.document_index.vespa_constants import BOOST
-from onyx.document_index.vespa_constants import CHUNK_CONTEXT
 from onyx.document_index.vespa_constants import CHUNK_ID
 from onyx.document_index.vespa_constants import CONTENT
 from onyx.document_index.vespa_constants import CONTENT_SUMMARY
-from onyx.document_index.vespa_constants import DOC_SUMMARY
 from onyx.document_index.vespa_constants import DOC_UPDATED_AT
 from onyx.document_index.vespa_constants import DOCUMENT_ID
 from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
@@ -128,8 +126,7 @@ def _vespa_hit_to_inference_chunk(
    return InferenceChunkUncleaned(
        chunk_id=fields[CHUNK_ID],
        blurb=fields.get(BLURB, ""),  # Unused
-        content=fields[CONTENT],  # Includes extra title prefix and metadata suffix;
-        # also sometimes context for contextual rag
+        content=fields[CONTENT],  # Includes extra title prefix and metadata suffix
        source_links=source_links_dict or {0: ""},
        section_continuation=fields[SECTION_CONTINUATION],
        document_id=fields[DOCUMENT_ID],
@@ -146,8 +143,6 @@ def _vespa_hit_to_inference_chunk(
        large_chunk_reference_ids=fields.get(LARGE_CHUNK_REFERENCE_IDS, []),
        metadata=metadata,
        metadata_suffix=fields.get(METADATA_SUFFIX),
-        doc_summary=fields.get(DOC_SUMMARY, ""),
-        chunk_context=fields.get(CHUNK_CONTEXT, ""),
        match_highlights=match_highlights,
        updated_at=updated_at,
    )
@@ -350,19 +345,6 @@ def query_vespa(
    filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]

    inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
-
-    try:
-        num_retrieved_inference_chunks = len(inference_chunks)
-        num_retrieved_document_ids = len(
-            set([chunk.document_id for chunk in inference_chunks])
-        )
-        logger.debug(
-            f"Retrieved {num_retrieved_inference_chunks} inference chunks for {num_retrieved_document_ids} documents"
-        )
-    except Exception as e:
-        # Debug logging only, should not fail the retrieval
-        logger.error(f"Error logging retrieval statistics: {e}")
-
    # Good Debugging Spot
    return inference_chunks

--- a/backend/onyx/document_index/vespa/index.py
+++ b/backend/onyx/document_index/vespa/index.py
@@ -187,7 +187,7 @@ class VespaIndex(DocumentIndex):
    ) -> None:
        if MULTI_TENANT:
            logger.info(
-                "Skipping Vespa index setup for multitenant (would wipe all indices)"
+                "Skipping Vespa index seup for multitenant (would wipe all indices)"
            )
            return None

--- a/backend/onyx/document_index/vespa/indexing_utils.py
+++ b/backend/onyx/document_index/vespa/indexing_utils.py
@@ -25,11 +25,9 @@ from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
 from onyx.document_index.vespa_constants import AGGREGATED_CHUNK_BOOST_FACTOR
 from onyx.document_index.vespa_constants import BLURB
 from onyx.document_index.vespa_constants import BOOST
-from onyx.document_index.vespa_constants import CHUNK_CONTEXT
 from onyx.document_index.vespa_constants import CHUNK_ID
 from onyx.document_index.vespa_constants import CONTENT
 from onyx.document_index.vespa_constants import CONTENT_SUMMARY
-from onyx.document_index.vespa_constants import DOC_SUMMARY
 from onyx.document_index.vespa_constants import DOC_UPDATED_AT
 from onyx.document_index.vespa_constants import DOCUMENT_ID
 from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
@@ -176,7 +174,7 @@ def _index_vespa_chunk(
        # For the BM25 index, the keyword suffix is used, the vector is already generated with the more
        # natural language representation of the metadata section
        CONTENT: remove_invalid_unicode_chars(
-            f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}{chunk.chunk_context}{chunk.metadata_suffix_keyword}"
+            f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_keyword}"
        ),
        # This duplication of `content` is needed for keyword highlighting
        # Note that it's not exactly the same as the actual content
@@ -191,8 +189,6 @@ def _index_vespa_chunk(
        # Save as a list for efficient extraction as an Attribute
        METADATA_LIST: metadata_list,
        METADATA_SUFFIX: remove_invalid_unicode_chars(chunk.metadata_suffix_keyword),
-        CHUNK_CONTEXT: chunk.chunk_context,
-        DOC_SUMMARY: chunk.doc_summary,
        EMBEDDINGS: embeddings_name_vector_map,
        TITLE_EMBEDDING: chunk.title_embedding,
        DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
--- a/backend/onyx/document_index/vespa_constants.py
+++ b/backend/onyx/document_index/vespa_constants.py
@@ -71,8 +71,6 @@ LARGE_CHUNK_REFERENCE_IDS = "large_chunk_reference_ids"
 METADATA = "metadata"
 METADATA_LIST = "metadata_list"
 METADATA_SUFFIX = "metadata_suffix"
-DOC_SUMMARY = "doc_summary"
-CHUNK_CONTEXT = "chunk_context"
 BOOST = "boost"
 AGGREGATED_CHUNK_BOOST_FACTOR = "aggregated_chunk_boost_factor"
 DOC_UPDATED_AT = "doc_updated_at"  # Indexed as seconds since epoch
@@ -108,8 +106,6 @@ YQL_BASE = (
    f"{LARGE_CHUNK_REFERENCE_IDS}, "
    f"{METADATA}, "
    f"{METADATA_SUFFIX}, "
-    f"{DOC_SUMMARY}, "
-    f"{CHUNK_CONTEXT}, "
    f"{CONTENT_SUMMARY} "
    f"from {{index_name}} where "
 )
--- a/backend/onyx/indexing/chunker.py
+++ b/backend/onyx/indexing/chunker.py
@@ -1,10 +1,7 @@
-from onyx.configs.app_configs import AVERAGE_SUMMARY_EMBEDDINGS
 from onyx.configs.app_configs import BLURB_SIZE
 from onyx.configs.app_configs import LARGE_CHUNK_RATIO
 from onyx.configs.app_configs import MINI_CHUNK_SIZE
 from onyx.configs.app_configs import SKIP_METADATA_IN_CHUNK
-from onyx.configs.app_configs import USE_CHUNK_SUMMARY
-from onyx.configs.app_configs import USE_DOCUMENT_SUMMARY
 from onyx.configs.constants import DocumentSource
 from onyx.configs.constants import RETURN_SEPARATOR
 from onyx.configs.constants import SECTION_SEPARATOR
@@ -16,7 +13,6 @@ from onyx.connectors.models import IndexingDocument
 from onyx.connectors.models import Section
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.indexing.models import DocAwareChunk
-from onyx.llm.utils import MAX_CONTEXT_TOKENS
 from onyx.natural_language_processing.utils import BaseTokenizer
 from onyx.utils.logger import setup_logger
 from onyx.utils.text_processing import clean_text
@@ -86,9 +82,6 @@ def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwar
        large_chunk_reference_ids=[chunk.chunk_id for chunk in chunks],
        mini_chunk_texts=None,
        large_chunk_id=large_chunk_id,
-        chunk_context="",
-        doc_summary="",
-        contextual_rag_reserved_tokens=0,
    )

    offset = 0
@@ -127,7 +120,6 @@ class Chunker:
        tokenizer: BaseTokenizer,
        enable_multipass: bool = False,
        enable_large_chunks: bool = False,
-        enable_contextual_rag: bool = False,
        blurb_size: int = BLURB_SIZE,
        include_metadata: bool = not SKIP_METADATA_IN_CHUNK,
        chunk_token_limit: int = DOC_EMBEDDING_CONTEXT_SIZE,
@@ -141,20 +133,9 @@ class Chunker:
        self.chunk_token_limit = chunk_token_limit
        self.enable_multipass = enable_multipass
        self.enable_large_chunks = enable_large_chunks
-        self.enable_contextual_rag = enable_contextual_rag
-        if enable_contextual_rag:
-            assert (
-                USE_CHUNK_SUMMARY or USE_DOCUMENT_SUMMARY
-            ), "Contextual RAG requires at least one of chunk summary and document summary enabled"
-        self.default_contextual_rag_reserved_tokens = MAX_CONTEXT_TOKENS * (
-            int(USE_CHUNK_SUMMARY) + int(USE_DOCUMENT_SUMMARY)
-        )
        self.tokenizer = tokenizer
        self.callback = callback

-        self.max_context = 0
-        self.prompt_tokens = 0
-
        self.blurb_splitter = SentenceSplitter(
            tokenizer=tokenizer.tokenize,
            chunk_size=blurb_size,
@@ -240,12 +221,30 @@ class Chunker:
            metadata_suffix_keyword=metadata_suffix_keyword,
            mini_chunk_texts=self._get_mini_chunk_texts(text),
            large_chunk_id=None,
-            doc_summary="",
-            chunk_context="",
-            contextual_rag_reserved_tokens=0,  # set per-document in _handle_single_document
        )
        chunks_list.append(new_chunk)

+    def _chunk_document(
+        self,
+        document: IndexingDocument,
+        title_prefix: str,
+        metadata_suffix_semantic: str,
+        metadata_suffix_keyword: str,
+        content_token_limit: int,
+    ) -> list[DocAwareChunk]:
+        """
+        Legacy method for backward compatibility.
+        Calls _chunk_document_with_sections with document.sections.
+        """
+        return self._chunk_document_with_sections(
+            document,
+            document.processed_sections,
+            title_prefix,
+            metadata_suffix_semantic,
+            metadata_suffix_keyword,
+            content_token_limit,
+        )
+
    def _chunk_document_with_sections(
        self,
        document: IndexingDocument,
@@ -265,7 +264,7 @@ class Chunker:

        for section_idx, section in enumerate(sections):
            # Get section text and other attributes
-            section_text = clean_text(str(section.text or ""))
+            section_text = clean_text(section.text or "")
            section_link_text = section.link or ""
            image_url = section.image_file_name

@@ -310,7 +309,7 @@ class Chunker:
                continue

            # CASE 2: Normal text section
-            section_token_count = len(self.tokenizer.encode(section_text))
+            section_token_count = len(self.tokenizer.tokenize(section_text))

            # If the section is large on its own, split it separately
            if section_token_count > content_token_limit:
@@ -333,7 +332,8 @@ class Chunker:
                    # If even the split_text is bigger than strict limit, further split
                    if (
                        STRICT_CHUNK_TOKEN_LIMIT
-                        and len(self.tokenizer.encode(split_text)) > content_token_limit
+                        and len(self.tokenizer.tokenize(split_text))
+                        > content_token_limit
                    ):
                        smaller_chunks = self._split_oversized_chunk(
                            split_text, content_token_limit
@@ -363,10 +363,10 @@ class Chunker:
                continue

            # If we can still fit this section into the current chunk, do so
-            current_token_count = len(self.tokenizer.encode(chunk_text))
+            current_token_count = len(self.tokenizer.tokenize(chunk_text))
            current_offset = len(shared_precompare_cleanup(chunk_text))
            next_section_tokens = (
-                len(self.tokenizer.encode(SECTION_SEPARATOR)) + section_token_count
+                len(self.tokenizer.tokenize(SECTION_SEPARATOR)) + section_token_count
            )

            if next_section_tokens + current_token_count <= content_token_limit:
@@ -414,7 +414,7 @@ class Chunker:
        # Title prep
        title = self._extract_blurb(document.get_title_for_document_index() or "")
        title_prefix = title + RETURN_SEPARATOR if title else ""
-        title_tokens = len(self.tokenizer.encode(title_prefix))
+        title_tokens = len(self.tokenizer.tokenize(title_prefix))

        # Metadata prep
        metadata_suffix_semantic = ""
@@ -427,50 +427,15 @@ class Chunker:
            ) = _get_metadata_suffix_for_document_index(
                document.metadata, include_separator=True
            )
-            metadata_tokens = len(self.tokenizer.encode(metadata_suffix_semantic))
+            metadata_tokens = len(self.tokenizer.tokenize(metadata_suffix_semantic))

        # If metadata is too large, skip it in the semantic content
        if metadata_tokens >= self.chunk_token_limit * MAX_METADATA_PERCENTAGE:
            metadata_suffix_semantic = ""
            metadata_tokens = 0

-        single_chunk_fits = True
-        doc_token_count = 0
-        if self.enable_contextual_rag:
-            doc_content = document.get_text_content()
-            tokenized_doc = self.tokenizer.tokenize(doc_content)
-            doc_token_count = len(tokenized_doc)
-
-            # check if doc + title + metadata fits in a single chunk. If so, no need for contextual RAG
-            single_chunk_fits = (
-                doc_token_count + title_tokens + metadata_tokens
-                <= self.chunk_token_limit
-            )
-
-        # expand the size of the context used for contextual rag based on whether chunk context and doc summary are used
-        context_size = 0
-        if (
-            self.enable_contextual_rag
-            and not single_chunk_fits
-            and not AVERAGE_SUMMARY_EMBEDDINGS
-        ):
-            context_size += self.default_contextual_rag_reserved_tokens
-
        # Adjust content token limit to accommodate title + metadata
-        content_token_limit = (
-            self.chunk_token_limit - title_tokens - metadata_tokens - context_size
-        )
-
-        # first check: if there is not enough actual chunk content when including contextual rag,
-        # then don't do contextual rag
-        if content_token_limit <= CHUNK_MIN_CONTENT:
-            context_size = 0  # Don't do contextual RAG
-            # revert to previous content token limit
-            content_token_limit = (
-                self.chunk_token_limit - title_tokens - metadata_tokens
-            )
-
-        # If there is not enough context remaining then just index the chunk with no prefix/suffix
+        content_token_limit = self.chunk_token_limit - title_tokens - metadata_tokens
        if content_token_limit <= CHUNK_MIN_CONTENT:
            # Not enough space left, so revert to full chunk without the prefix
            content_token_limit = self.chunk_token_limit
@@ -494,9 +459,6 @@ class Chunker:
            large_chunks = generate_large_chunks(normal_chunks)
            normal_chunks.extend(large_chunks)

-        for chunk in normal_chunks:
-            chunk.contextual_rag_reserved_tokens = context_size
-
        return normal_chunks

    def chunk(self, documents: list[IndexingDocument]) -> list[DocAwareChunk]:
--- a/backend/onyx/indexing/embedder.py
+++ b/backend/onyx/indexing/embedder.py
@@ -121,7 +121,7 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
            if chunk.large_chunk_reference_ids:
                large_chunks_present = True
            chunk_text = (
-                f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}{chunk.chunk_context}{chunk.metadata_suffix_semantic}"
+                f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_semantic}"
            ) or chunk.source_document.get_title_for_document_index()

            if not chunk_text:
--- a/backend/onyx/indexing/indexing_pipeline.py
+++ b/backend/onyx/indexing/indexing_pipeline.py
@@ -1,4 +1,3 @@
-from collections import defaultdict
 from collections.abc import Callable
 from functools import partial
 from typing import Protocol
@@ -9,13 +8,7 @@ from sqlalchemy.orm import Session

 from onyx.access.access import get_access_for_documents
 from onyx.access.models import DocumentAccess
-from onyx.configs.app_configs import DEFAULT_CONTEXTUAL_RAG_LLM_NAME
-from onyx.configs.app_configs import DEFAULT_CONTEXTUAL_RAG_LLM_PROVIDER
-from onyx.configs.app_configs import ENABLE_CONTEXTUAL_RAG
 from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
-from onyx.configs.app_configs import MAX_TOKENS_FOR_FULL_INCLUSION
-from onyx.configs.app_configs import USE_CHUNK_SUMMARY
-from onyx.configs.app_configs import USE_DOCUMENT_SUMMARY
 from onyx.configs.constants import DEFAULT_BOOST
 from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
 from onyx.configs.model_configs import USE_INFORMATION_CONTENT_CLASSIFICATION
@@ -43,10 +36,9 @@ from onyx.db.document import upsert_documents
 from onyx.db.document_set import fetch_document_sets_for_documents
 from onyx.db.engine import get_session_with_current_tenant
 from onyx.db.models import Document as DBDocument
-from onyx.db.models import IndexModelStatus
 from onyx.db.pg_file_store import get_pgfilestore_by_file_name
 from onyx.db.pg_file_store import read_lobj
-from onyx.db.search_settings import get_active_search_settings
+from onyx.db.search_settings import get_current_search_settings
 from onyx.db.tag import create_or_add_document_tag
 from onyx.db.tag import create_or_add_document_tag_list
 from onyx.document_index.document_index_utils import (
@@ -65,24 +57,11 @@ from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.indexing.models import IndexChunk
 from onyx.indexing.models import UpdatableChunkData
 from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
-from onyx.llm.chat_llm import LLMRateLimitError
 from onyx.llm.factory import get_default_llm_with_vision
-from onyx.llm.factory import get_llm_for_contextual_rag
-from onyx.llm.interfaces import LLM
-from onyx.llm.utils import get_max_input_tokens
-from onyx.llm.utils import MAX_CONTEXT_TOKENS
-from onyx.llm.utils import message_to_string
 from onyx.natural_language_processing.search_nlp_models import (
    InformationContentClassificationModel,
 )
-from onyx.natural_language_processing.utils import BaseTokenizer
-from onyx.natural_language_processing.utils import get_tokenizer
-from onyx.natural_language_processing.utils import tokenizer_trim_middle
-from onyx.prompts.chat_prompts import CONTEXTUAL_RAG_PROMPT1
-from onyx.prompts.chat_prompts import CONTEXTUAL_RAG_PROMPT2
-from onyx.prompts.chat_prompts import DOCUMENT_SUMMARY_PROMPT
 from onyx.utils.logger import setup_logger
-from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
 from onyx.utils.timing import log_function_time
 from shared_configs.configs import (
    INDEXING_INFORMATION_CONTENT_CLASSIFICATION_CUTOFF_LENGTH,
@@ -270,8 +249,6 @@ def index_doc_batch_with_handler(
    db_session: Session,
    tenant_id: str,
    ignore_time_skip: bool = False,
-    enable_contextual_rag: bool = False,
-    llm: LLM | None = None,
 ) -> IndexingPipelineResult:
    try:
        index_pipeline_result = index_doc_batch(
@@ -284,8 +261,6 @@ def index_doc_batch_with_handler(
            db_session=db_session,
            ignore_time_skip=ignore_time_skip,
            tenant_id=tenant_id,
-            enable_contextual_rag=enable_contextual_rag,
-            llm=llm,
        )
    except Exception as e:
        # don't log the batch directly, it's too much text
@@ -464,7 +439,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
                **document.dict(),
                processed_sections=[
                    Section(
-                        text=section.text if isinstance(section, TextSection) else "",
+                        text=section.text if isinstance(section, TextSection) else None,
                        link=section.link,
                        image_file_name=section.image_file_name
                        if isinstance(section, ImageSection)
@@ -484,11 +459,11 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
        for section in document.sections:
            # For ImageSection, process and create base Section with both text and image_file_name
            if isinstance(section, ImageSection):
-                # Default section with image path preserved - ensure text is always a string
+                # Default section with image path preserved
                processed_section = Section(
                    link=section.link,
                    image_file_name=section.image_file_name,
-                    text="",  # Initialize with empty string
+                    text=None,  # Will be populated if summarization succeeds
                )

                # Try to get image summary
@@ -531,21 +506,13 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
            # For TextSection, create a base Section with text and link
            elif isinstance(section, TextSection):
                processed_section = Section(
-                    text=section.text or "",  # Ensure text is always a string, not None
-                    link=section.link,
-                    image_file_name=None,
+                    text=section.text, link=section.link, image_file_name=None
                )
                processed_sections.append(processed_section)

-            # If it's already a base Section (unlikely), just append it with text validation
+            # If it's already a base Section (unlikely), just append it
            else:
-                # Ensure text is always a string
-                processed_section = Section(
-                    text=section.text if section.text is not None else "",
-                    link=section.link,
-                    image_file_name=section.image_file_name,
-                )
-                processed_sections.append(processed_section)
+                processed_sections.append(section)

        # Create IndexingDocument with original sections and processed_sections
        indexed_document = IndexingDocument(
@@ -556,145 +523,6 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
    return indexed_documents


-def add_document_summaries(
-    chunks_by_doc: list[DocAwareChunk],
-    llm: LLM,
-    tokenizer: BaseTokenizer,
-    trunc_doc_tokens: int,
-) -> list[int] | None:
-    """
-    Adds a document summary to a list of chunks from the same document.
-    Returns the number of tokens in the document.
-    """
-
-    doc_tokens = []
-    # this is value is the same for each chunk in the document; 0 indicates
-    # There is not enough space for contextual RAG (the chunk content
-    # and possibly metadata took up too much space)
-    if chunks_by_doc[0].contextual_rag_reserved_tokens == 0:
-        return None
-
-    doc_tokens = tokenizer.encode(chunks_by_doc[0].source_document.get_text_content())
-    doc_content = tokenizer_trim_middle(doc_tokens, trunc_doc_tokens, tokenizer)
-    summary_prompt = DOCUMENT_SUMMARY_PROMPT.format(document=doc_content)
-    doc_summary = message_to_string(
-        llm.invoke(summary_prompt, max_tokens=MAX_CONTEXT_TOKENS)
-    )
-
-    for chunk in chunks_by_doc:
-        chunk.doc_summary = doc_summary
-
-    return doc_tokens
-
-
-def add_chunk_summaries(
-    chunks_by_doc: list[DocAwareChunk],
-    llm: LLM,
-    tokenizer: BaseTokenizer,
-    trunc_doc_chunk_tokens: int,
-    doc_tokens: list[int] | None,
-) -> None:
-    """
-    Adds chunk summaries to the chunks grouped by document id.
-    Chunk summaries look at the chunk as well as the entire document (or a summary,
-    if the document is too long) and describe how the chunk relates to the document.
-    """
-    # all chunks within a document have the same contextual_rag_reserved_tokens
-    if chunks_by_doc[0].contextual_rag_reserved_tokens == 0:
-        return
-
-    # use values computed in above doc summary section if available
-    doc_tokens = doc_tokens or tokenizer.encode(
-        chunks_by_doc[0].source_document.get_text_content()
-    )
-    doc_content = tokenizer_trim_middle(doc_tokens, trunc_doc_chunk_tokens, tokenizer)
-
-    # only compute doc summary if needed
-    doc_info = (
-        doc_content
-        if len(doc_tokens) <= MAX_TOKENS_FOR_FULL_INCLUSION
-        else chunks_by_doc[0].doc_summary
-    )
-    if not doc_info:
-        # This happens if the document is too long AND document summaries are turned off
-        # In this case we compute a doc summary using the LLM
-        doc_info = message_to_string(
-            llm.invoke(
-                DOCUMENT_SUMMARY_PROMPT.format(document=doc_content),
-                max_tokens=MAX_CONTEXT_TOKENS,
-            )
-        )
-
-    context_prompt1 = CONTEXTUAL_RAG_PROMPT1.format(document=doc_info)
-
-    def assign_context(chunk: DocAwareChunk) -> None:
-        context_prompt2 = CONTEXTUAL_RAG_PROMPT2.format(chunk=chunk.content)
-        try:
-            chunk.chunk_context = message_to_string(
-                llm.invoke(
-                    context_prompt1 + context_prompt2,
-                    max_tokens=MAX_CONTEXT_TOKENS,
-                )
-            )
-        except LLMRateLimitError as e:
-            # Erroring during chunker is undesirable, so we log the error and continue
-            # TODO: for v2, add robust retry logic
-            logger.exception(f"Rate limit adding chunk summary: {e}", exc_info=e)
-            chunk.chunk_context = ""
-        except Exception as e:
-            logger.exception(f"Error adding chunk summary: {e}", exc_info=e)
-            chunk.chunk_context = ""
-
-    run_functions_tuples_in_parallel(
-        [(assign_context, (chunk,)) for chunk in chunks_by_doc]
-    )
-
-
-def add_contextual_summaries(
-    chunks: list[DocAwareChunk],
-    llm: LLM,
-    tokenizer: BaseTokenizer,
-    chunk_token_limit: int,
-) -> list[DocAwareChunk]:
-    """
-    Adds Document summary and chunk-within-document context to the chunks
-    based on which environment variables are set.
-    """
-    max_context = get_max_input_tokens(
-        model_name=llm.config.model_name,
-        model_provider=llm.config.model_provider,
-        output_tokens=MAX_CONTEXT_TOKENS,
-    )
-    doc2chunks = defaultdict(list)
-    for chunk in chunks:
-        doc2chunks[chunk.source_document.id].append(chunk)
-
-    # The number of tokens allowed for the document when computing a document summary
-    trunc_doc_summary_tokens = max_context - len(
-        tokenizer.encode(DOCUMENT_SUMMARY_PROMPT)
-    )
-
-    prompt_tokens = len(
-        tokenizer.encode(CONTEXTUAL_RAG_PROMPT1 + CONTEXTUAL_RAG_PROMPT2)
-    )
-    # The number of tokens allowed for the document when computing a
-    # "chunk in context of document" summary
-    trunc_doc_chunk_tokens = max_context - prompt_tokens - chunk_token_limit
-    for chunks_by_doc in doc2chunks.values():
-        doc_tokens = None
-        if USE_DOCUMENT_SUMMARY:
-            doc_tokens = add_document_summaries(
-                chunks_by_doc, llm, tokenizer, trunc_doc_summary_tokens
-            )
-
-        if USE_CHUNK_SUMMARY:
-            add_chunk_summaries(
-                chunks_by_doc, llm, tokenizer, trunc_doc_chunk_tokens, doc_tokens
-            )
-
-    return chunks
-
-
@log_function_time(debug_only=True)
 def index_doc_batch(
    *,
@@ -706,8 +534,6 @@ def index_doc_batch(
    index_attempt_metadata: IndexAttemptMetadata,
    db_session: Session,
    tenant_id: str,
-    enable_contextual_rag: bool = False,
-    llm: LLM | None = None,
    ignore_time_skip: bool = False,
    filter_fnc: Callable[[list[Document]], list[Document]] = filter_documents,
 ) -> IndexingPipelineResult:
@@ -770,20 +596,6 @@ def index_doc_batch(
    # a common source of failure for the indexing pipeline
    chunks: list[DocAwareChunk] = chunker.chunk(ctx.indexable_docs)

-    # contextual RAG
-    if enable_contextual_rag:
-        assert llm is not None, "must provide an LLM for contextual RAG"
-        llm_tokenizer = get_tokenizer(
-            model_name=llm.config.model_name,
-            provider_type=llm.config.model_provider,
-        )
-
-        # Because the chunker's tokens are different from the LLM's tokens,
-        # We add a fudge factor to ensure we truncate prompts to the LLM's token limit
-        chunks = add_contextual_summaries(
-            chunks, llm, llm_tokenizer, chunker.chunk_token_limit * 2
-        )
-
    logger.debug("Starting embedding")
    chunks_with_embeddings, embedding_failures = (
        embed_chunks_with_failure_handling(
@@ -979,33 +791,13 @@ def build_indexing_pipeline(
    callback: IndexingHeartbeatInterface | None = None,
 ) -> IndexingPipelineProtocol:
    """Builds a pipeline which takes in a list (batch) of docs and indexes them."""
-    all_search_settings = get_active_search_settings(db_session)
-    if (
-        all_search_settings.secondary
-        and all_search_settings.secondary.status == IndexModelStatus.FUTURE
-    ):
-        search_settings = all_search_settings.secondary
-    else:
-        search_settings = all_search_settings.primary
-
+    search_settings = get_current_search_settings(db_session)
    multipass_config = get_multipass_config(search_settings)

-    enable_contextual_rag = (
-        search_settings.enable_contextual_rag or ENABLE_CONTEXTUAL_RAG
-    )
-    llm = None
-    if enable_contextual_rag:
-        llm = get_llm_for_contextual_rag(
-            search_settings.contextual_rag_llm_name or DEFAULT_CONTEXTUAL_RAG_LLM_NAME,
-            search_settings.contextual_rag_llm_provider
-            or DEFAULT_CONTEXTUAL_RAG_LLM_PROVIDER,
-        )
-
    chunker = chunker or Chunker(
        tokenizer=embedder.embedding_model.tokenizer,
        enable_multipass=multipass_config.multipass_indexing,
        enable_large_chunks=multipass_config.enable_large_chunks,
-        enable_contextual_rag=enable_contextual_rag,
        # after every doc, update status in case there are a bunch of really long docs
        callback=callback,
    )
@@ -1019,6 +811,4 @@ def build_indexing_pipeline(
        ignore_time_skip=ignore_time_skip,
        db_session=db_session,
        tenant_id=tenant_id,
-        enable_contextual_rag=enable_contextual_rag,
-        llm=llm,
    )
--- a/backend/onyx/indexing/models.py
+++ b/backend/onyx/indexing/models.py
@@ -49,15 +49,6 @@ class DocAwareChunk(BaseChunk):
    metadata_suffix_semantic: str
    metadata_suffix_keyword: str

-    # This is the number of tokens reserved for contextual RAG
-    # in the chunk. doc_summary and chunk_context conbined should
-    # contain at most this many tokens.
-    contextual_rag_reserved_tokens: int
-    # This is the summary for the document generated for contextual RAG
-    doc_summary: str
-    # This is the context for this chunk generated for contextual RAG
-    chunk_context: str
-
    mini_chunk_texts: list[str] | None

    large_chunk_id: int | None
@@ -163,9 +154,6 @@ class IndexingSetting(EmbeddingModelDetail):
    reduced_dimension: int | None = None

    background_reindex_enabled: bool = True
-    enable_contextual_rag: bool
-    contextual_rag_llm_name: str | None = None
-    contextual_rag_llm_provider: str | None = None

    # This disables the "model_" protected namespace for pydantic
    model_config = {"protected_namespaces": ()}
@@ -190,7 +178,6 @@ class IndexingSetting(EmbeddingModelDetail):
            embedding_precision=search_settings.embedding_precision,
            reduced_dimension=search_settings.reduced_dimension,
            background_reindex_enabled=search_settings.background_reindex_enabled,
-            enable_contextual_rag=search_settings.enable_contextual_rag,
        )


--- a/backend/onyx/llm/chat_llm.py
+++ b/backend/onyx/llm/chat_llm.py
@@ -425,12 +425,12 @@ class DefaultMultiLLM(LLM):
                messages=processed_prompt,
                tools=tools,
                tool_choice=tool_choice if tools else None,
-                max_tokens=max_tokens,
                # streaming choice
                stream=stream,
                # model params
                temperature=0,
                timeout=timeout_override or self._timeout,
+                max_tokens=max_tokens,
                # For now, we don't support parallel tool calls
                # NOTE: we can't pass this in if tools are not specified
                # or else OpenAI throws an error
@@ -531,7 +531,6 @@ class DefaultMultiLLM(LLM):
                tool_choice,
                structured_response_format,
                timeout_override,
-                max_tokens,
            )
            return

--- a/backend/onyx/llm/factory.py
+++ b/backend/onyx/llm/factory.py
@@ -16,7 +16,6 @@ from onyx.llm.exceptions import GenAIDisabledException
 from onyx.llm.interfaces import LLM
 from onyx.llm.override_models import LLMOverride
 from onyx.llm.utils import model_supports_image_input
-from onyx.server.manage.llm.models import LLMProvider
 from onyx.server.manage.llm.models import LLMProviderView
 from onyx.utils.headers import build_llm_extra_headers
 from onyx.utils.logger import setup_logger
@@ -155,40 +154,6 @@ def get_default_llm_with_vision(
    return None


-def llm_from_provider(
-    model_name: str,
-    llm_provider: LLMProvider,
-    timeout: int | None = None,
-    temperature: float | None = None,
-    additional_headers: dict[str, str] | None = None,
-    long_term_logger: LongTermLogger | None = None,
-) -> LLM:
-    return get_llm(
-        provider=llm_provider.provider,
-        model=model_name,
-        deployment_name=llm_provider.deployment_name,
-        api_key=llm_provider.api_key,
-        api_base=llm_provider.api_base,
-        api_version=llm_provider.api_version,
-        custom_config=llm_provider.custom_config,
-        timeout=timeout,
-        temperature=temperature,
-        additional_headers=additional_headers,
-        long_term_logger=long_term_logger,
-    )
-
-
-def get_llm_for_contextual_rag(model_name: str, model_provider: str) -> LLM:
-    with get_session_context_manager() as db_session:
-        llm_provider = fetch_llm_provider_view(db_session, model_provider)
-    if not llm_provider:
-        raise ValueError("No LLM provider with name {} found".format(model_provider))
-    return llm_from_provider(
-        model_name=model_name,
-        llm_provider=llm_provider,
-    )
-
-
 def get_default_llms(
    timeout: int | None = None,
    temperature: float | None = None,
@@ -214,9 +179,14 @@ def get_default_llms(
        raise ValueError("No fast default model name found")

    def _create_llm(model: str) -> LLM:
-        return llm_from_provider(
-            model_name=model,
-            llm_provider=llm_provider,
+        return get_llm(
+            provider=llm_provider.provider,
+            model=model,
+            deployment_name=llm_provider.deployment_name,
+            api_key=llm_provider.api_key,
+            api_base=llm_provider.api_base,
+            api_version=llm_provider.api_version,
+            custom_config=llm_provider.custom_config,
            timeout=timeout,
            temperature=temperature,
            additional_headers=additional_headers,
--- a/backend/onyx/llm/utils.py
+++ b/backend/onyx/llm/utils.py
@@ -29,19 +29,13 @@ from litellm.exceptions import Timeout  # type: ignore
 from litellm.exceptions import UnprocessableEntityError  # type: ignore

 from onyx.configs.app_configs import LITELLM_CUSTOM_ERROR_MESSAGE_MAPPINGS
-from onyx.configs.app_configs import MAX_TOKENS_FOR_FULL_INCLUSION
-from onyx.configs.app_configs import USE_CHUNK_SUMMARY
-from onyx.configs.app_configs import USE_DOCUMENT_SUMMARY
 from onyx.configs.constants import MessageType
-from onyx.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
 from onyx.configs.model_configs import GEN_AI_MAX_TOKENS
 from onyx.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS
 from onyx.configs.model_configs import GEN_AI_NUM_RESERVED_OUTPUT_TOKENS
 from onyx.file_store.models import ChatFileType
 from onyx.file_store.models import InMemoryChatFile
 from onyx.llm.interfaces import LLM
-from onyx.prompts.chat_prompts import CONTEXTUAL_RAG_TOKEN_ESTIMATE
-from onyx.prompts.chat_prompts import DOCUMENT_SUMMARY_TOKEN_ESTIMATE
 from onyx.prompts.constants import CODE_BLOCK_PAT
 from onyx.utils.b64 import get_image_type
 from onyx.utils.b64 import get_image_type_from_bytes
@@ -50,10 +44,6 @@ from shared_configs.configs import LOG_LEVEL

 logger = setup_logger()

-MAX_CONTEXT_TOKENS = 100
-ONE_MILLION = 1_000_000
-CHUNKS_PER_DOC_ESTIMATE = 5
-

 def litellm_exception_to_error_msg(
    e: Exception,
@@ -426,72 +416,6 @@ def _find_model_obj(model_map: dict, provider: str, model_name: str) -> dict | N
    return None


-def get_llm_contextual_cost(
-    llm: LLM,
-) -> float:
-    """
-    Approximate the cost of using the given LLM for indexing with Contextual RAG.
-
-    We use a precomputed estimate for the number of tokens in the contextualizing prompts,
-    and we assume that every chunk is maximized in terms of content and context.
-    We also assume that every document is maximized in terms of content, as currently if
-    a document is longer than a certain length, its summary is used instead of the full content.
-
-    We expect that the first assumption will overestimate more than the second one
-    underestimates, so this should be a fairly conservative price estimate. Also,
-    this does not account for the cost of documents that fit within a single chunk
-    which do not get contextualized.
-    """
-
-    # calculate input costs
-    num_tokens = ONE_MILLION
-    num_input_chunks = num_tokens // DOC_EMBEDDING_CONTEXT_SIZE
-
-    # We assume that the documents are MAX_TOKENS_FOR_FULL_INCLUSION tokens long
-    # on average.
-    num_docs = num_tokens // MAX_TOKENS_FOR_FULL_INCLUSION
-
-    num_input_tokens = 0
-    num_output_tokens = 0
-
-    if not USE_CHUNK_SUMMARY and not USE_DOCUMENT_SUMMARY:
-        return 0
-
-    if USE_CHUNK_SUMMARY:
-        # Each per-chunk prompt includes:
-        # - The prompt tokens
-        # - the document tokens
-        # - the chunk tokens
-
-        # for each chunk, we prompt the LLM with the contextual RAG prompt
-        # and the full document content (or the doc summary, so this is an overestimate)
-        num_input_tokens += num_input_chunks * (
-            CONTEXTUAL_RAG_TOKEN_ESTIMATE + MAX_TOKENS_FOR_FULL_INCLUSION
-        )
-
-        # in aggregate, each chunk content is used as a prompt input once
-        # so the full input size is covered
-        num_input_tokens += num_tokens
-
-        # A single MAX_CONTEXT_TOKENS worth of output is generated per chunk
-        num_output_tokens += num_input_chunks * MAX_CONTEXT_TOKENS
-
-    # going over each doc once means all the tokens, plus the prompt tokens for
-    # the summary prompt. This CAN happen even when USE_DOCUMENT_SUMMARY is false,
-    # since doc summaries are used for longer documents when USE_CHUNK_SUMMARY is true.
-    # So, we include this unconditionally to overestimate.
-    num_input_tokens += num_tokens + num_docs * DOCUMENT_SUMMARY_TOKEN_ESTIMATE
-    num_output_tokens += num_docs * MAX_CONTEXT_TOKENS
-
-    usd_per_prompt, usd_per_completion = litellm.cost_per_token(
-        model=llm.config.model_name,
-        prompt_tokens=num_input_tokens,
-        completion_tokens=num_output_tokens,
-    )
-    # Costs are in USD dollars per million tokens
-    return usd_per_prompt + usd_per_completion
-
-
 def get_llm_max_tokens(
    model_map: dict,
    model_name: str,
--- a/backend/onyx/main.py
+++ b/backend/onyx/main.py
@@ -391,11 +391,6 @@ def get_application() -> FastAPI:
            prefix="/auth",
        )

-    if (
-        AUTH_TYPE == AuthType.CLOUD
-        or AUTH_TYPE == AuthType.BASIC
-        or AUTH_TYPE == AuthType.GOOGLE_OAUTH
-    ):
        # Add refresh token endpoint for OAuth as well
        include_auth_router_with_prefix(
            application,
--- a/backend/onyx/natural_language_processing/utils.py
+++ b/backend/onyx/natural_language_processing/utils.py
@@ -3,8 +3,6 @@ from abc import ABC
 from abc import abstractmethod
 from copy import copy

-from tokenizers import Encoding  # type: ignore
-from tokenizers import Tokenizer  # type: ignore
 from transformers import logging as transformer_logging  # type:ignore

 from onyx.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
@@ -13,8 +11,6 @@ from onyx.context.search.models import InferenceChunk
 from onyx.utils.logger import setup_logger
 from shared_configs.enums import EmbeddingProvider

-TRIM_SEP_PAT = "\n... {n} tokens removed...\n"
-
 logger = setup_logger()
 transformer_logging.set_verbosity_error()
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -71,27 +67,16 @@ class TiktokenTokenizer(BaseTokenizer):

 class HuggingFaceTokenizer(BaseTokenizer):
    def __init__(self, model_name: str):
-        self.encoder: Tokenizer = Tokenizer.from_pretrained(model_name)
+        from tokenizers import Tokenizer  # type: ignore

-    def _safer_encode(self, string: str) -> Encoding:
-        """
-        Encode a string using the HuggingFaceTokenizer, but if it fails,
-        encode the string as ASCII and decode it back to a string. This helps
-        in cases where the string has weird characters like \udeb4.
-        """
-        try:
-            return self.encoder.encode(string, add_special_tokens=False)
-        except Exception:
-            return self.encoder.encode(
-                string.encode("ascii", "ignore").decode(), add_special_tokens=False
-            )
+        self.encoder = Tokenizer.from_pretrained(model_name)

    def encode(self, string: str) -> list[int]:
        # this returns no special tokens
-        return self._safer_encode(string).ids
+        return self.encoder.encode(string, add_special_tokens=False).ids

    def tokenize(self, string: str) -> list[str]:
-        return self._safer_encode(string).tokens
+        return self.encoder.encode(string, add_special_tokens=False).tokens

    def decode(self, tokens: list[int]) -> str:
        return self.encoder.decode(tokens)
@@ -174,26 +159,9 @@ def tokenizer_trim_content(
    content: str, desired_length: int, tokenizer: BaseTokenizer
 ) -> str:
    tokens = tokenizer.encode(content)
-    if len(tokens) <= desired_length:
-        return content
-
-    return tokenizer.decode(tokens[:desired_length])
-
-
-def tokenizer_trim_middle(
-    tokens: list[int], desired_length: int, tokenizer: BaseTokenizer
-) -> str:
-    if len(tokens) <= desired_length:
-        return tokenizer.decode(tokens)
-    sep_str = TRIM_SEP_PAT.format(n=len(tokens) - desired_length)
-    sep_tokens = tokenizer.encode(sep_str)
-    slice_size = (desired_length - len(sep_tokens)) // 2
-    assert slice_size > 0, "Slice size is not positive, desired length is too short"
-    return (
-        tokenizer.decode(tokens[:slice_size])
-        + sep_str
-        + tokenizer.decode(tokens[-slice_size:])
-    )
+    if len(tokens) > desired_length:
+        content = tokenizer.decode(tokens[:desired_length])
+    return content


 def tokenizer_trim_chunks(
--- a/backend/onyx/prompts/chat_prompts.py
+++ b/backend/onyx/prompts/chat_prompts.py
@@ -220,29 +220,3 @@ Chat History:

 Based on the above, what is a short name to convey the topic of the conversation?
 """.strip()
-
-# NOTE: the prompt separation is partially done for efficiency; previously I tried
-# to do it all in one prompt with sequential format() calls but this will cause a backend
-# error when the document contains any {} as python will expect the {} to be filled by
-# format() arguments
-CONTEXTUAL_RAG_PROMPT1 = """<document>
-{document}
-</document>
-Here is the chunk we want to situate within the whole document"""
-
-CONTEXTUAL_RAG_PROMPT2 = """<chunk>
-{chunk}
-</chunk>
-Please give a short succinct context to situate this chunk within the overall document
-for the purposes of improving search retrieval of the chunk. Answer only with the succinct
-context and nothing else. """
-
-CONTEXTUAL_RAG_TOKEN_ESTIMATE = 64  # 19 + 45
-
-DOCUMENT_SUMMARY_PROMPT = """<document>
-{document}
-</document>
-Please give a short succinct summary of the entire document. Answer only with the succinct
-summary and nothing else. """
-
-DOCUMENT_SUMMARY_TOKEN_ESTIMATE = 29
--- a/backend/onyx/seeding/load_docs.py
+++ b/backend/onyx/seeding/load_docs.py
@@ -87,9 +87,6 @@ def _create_indexable_chunks(
            metadata_suffix_keyword="",
            mini_chunk_texts=None,
            large_chunk_reference_ids=[],
-            doc_summary="",
-            chunk_context="",
-            contextual_rag_reserved_tokens=0,
            embeddings=ChunkEmbedding(
                full_embedding=preprocessed_doc["content_embedding"],
                mini_chunk_embeddings=[],
--- a/backend/onyx/server/manage/llm/api.py
+++ b/backend/onyx/server/manage/llm/api.py
@@ -21,11 +21,9 @@ from onyx.llm.factory import get_default_llms
 from onyx.llm.factory import get_llm
 from onyx.llm.llm_provider_options import fetch_available_well_known_llms
 from onyx.llm.llm_provider_options import WellKnownLLMProviderDescriptor
-from onyx.llm.utils import get_llm_contextual_cost
 from onyx.llm.utils import litellm_exception_to_error_msg
 from onyx.llm.utils import model_supports_image_input
 from onyx.llm.utils import test_llm
-from onyx.server.manage.llm.models import LLMCost
 from onyx.server.manage.llm.models import LLMProviderDescriptor
 from onyx.server.manage.llm.models import LLMProviderUpsertRequest
 from onyx.server.manage.llm.models import LLMProviderView
@@ -288,38 +286,3 @@ def list_llm_provider_basics(
            db_session, user
        )
    ]
-
-
-@admin_router.get("/provider-contextual-cost")
-def get_provider_contextual_cost(
-    _: User | None = Depends(current_admin_user),
-    db_session: Session = Depends(get_session),
-) -> list[LLMCost]:
-    """
-    Get the cost of Re-indexing all documents for contextual retrieval.
-
-    See https://docs.litellm.ai/docs/completion/token_usage#5-cost_per_token
-    This includes:
-    - The cost of invoking the LLM on each chunk-document pair to get
-      - the doc_summary
-      - the chunk_context
-    - The per-token cost of the LLM used to generate the doc_summary and chunk_context
-    """
-    providers = fetch_existing_llm_providers(db_session)
-    costs = []
-    for provider in providers:
-        for model_name in provider.display_model_names or provider.model_names or []:
-            llm = get_llm(
-                provider=provider.provider,
-                model=model_name,
-                deployment_name=provider.deployment_name,
-                api_key=provider.api_key,
-                api_base=provider.api_base,
-                api_version=provider.api_version,
-                custom_config=provider.custom_config,
-            )
-            cost = get_llm_contextual_cost(llm)
-            costs.append(
-                LLMCost(provider=provider.name, model_name=model_name, cost=cost)
-            )
-    return costs
--- a/backend/onyx/server/manage/llm/models.py
+++ b/backend/onyx/server/manage/llm/models.py
@@ -119,9 +119,3 @@ class VisionProviderResponse(LLMProviderView):
    """Response model for vision providers endpoint, including vision-specific fields."""

    vision_models: list[str]
-
-
-class LLMCost(BaseModel):
-    provider: str
-    model_name: str
-    cost: float
--- a/backend/scripts/query_time_check/seed_dummy_docs.py
+++ b/backend/scripts/query_time_check/seed_dummy_docs.py
@@ -63,10 +63,7 @@ def generate_dummy_chunk(
        title_prefix=f"Title prefix for doc {doc_id}",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
-        doc_summary="",
-        chunk_context="",
        mini_chunk_texts=None,
-        contextual_rag_reserved_tokens=0,
        embeddings=ChunkEmbedding(
            full_embedding=generate_random_embedding(embedding_dim),
            mini_chunk_embeddings=[],
--- a/backend/shared_configs/configs.py
+++ b/backend/shared_configs/configs.py
@@ -99,7 +99,6 @@ PRESERVED_SEARCH_FIELDS = [
    "api_url",
    "index_name",
    "multipass_indexing",
-    "enable_contextual_rag",
    "model_dim",
    "normalize",
    "passage_prefix",
--- a/backend/tests/unit/ee/onyx/external_permissions/salesforce/test_postprocessing.py
+++ b/backend/tests/unit/ee/onyx/external_permissions/salesforce/test_postprocessing.py
@@ -32,8 +32,6 @@ def create_test_chunk(
        match_highlights=[],
        updated_at=datetime.now(),
        image_file_name=None,
-        doc_summary="",
-        chunk_context="",
    )


--- a/backend/tests/unit/onyx/chat/conftest.py
+++ b/backend/tests/unit/onyx/chat/conftest.py
@@ -78,8 +78,6 @@ def mock_inference_sections() -> list[InferenceSection]:
                source_links={0: "https://example.com/doc1"},
                match_highlights=[],
                image_file_name=None,
-                doc_summary="",
-                chunk_context="",
            ),
            chunks=MagicMock(),
        ),
@@ -103,8 +101,6 @@ def mock_inference_sections() -> list[InferenceSection]:
                source_links={0: "https://example.com/doc2"},
                match_highlights=[],
                image_file_name=None,
-                doc_summary="",
-                chunk_context="",
            ),
            chunks=MagicMock(),
        ),
--- a/backend/tests/unit/onyx/chat/stream_processing/test_quotes_processing.py
+++ b/backend/tests/unit/onyx/chat/stream_processing/test_quotes_processing.py
@@ -151,8 +151,6 @@ def test_fuzzy_match_quotes_to_docs() -> None:
        match_highlights=[],
        updated_at=None,
        image_file_name=None,
-        doc_summary="",
-        chunk_context="",
    )
    test_chunk_1 = InferenceChunk(
        document_id="test doc 1",
@@ -172,8 +170,6 @@ def test_fuzzy_match_quotes_to_docs() -> None:
        match_highlights=[],
        updated_at=None,
        image_file_name=None,
-        doc_summary="",
-        chunk_context="",
    )

    test_quotes = [
--- a/backend/tests/unit/onyx/chat/test_prune_and_merge.py
+++ b/backend/tests/unit/onyx/chat/test_prune_and_merge.py
@@ -38,8 +38,6 @@ def create_inference_chunk(
        match_highlights=[],
        updated_at=None,
        image_file_name=None,
-        doc_summary="",
-        chunk_context="",
    )


--- a/backend/tests/unit/onyx/indexing/conftest.py
+++ b/backend/tests/unit/onyx/indexing/conftest.py
@@ -1,6 +1,5 @@
 import pytest

-from onyx.indexing.embedder import DefaultIndexingEmbedder
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface


@@ -18,13 +17,3 @@ class MockHeartbeat(IndexingHeartbeatInterface):
@pytest.fixture
 def mock_heartbeat() -> MockHeartbeat:
    return MockHeartbeat()
-
-
-@pytest.fixture
-def embedder() -> DefaultIndexingEmbedder:
-    return DefaultIndexingEmbedder(
-        model_name="intfloat/e5-base-v2",
-        normalize=True,
-        query_prefix=None,
-        passage_prefix=None,
-    )
--- a/backend/tests/unit/onyx/indexing/test_chunker.py
+++ b/backend/tests/unit/onyx/indexing/test_chunker.py
@@ -1,24 +1,25 @@
-from typing import Any
-from unittest.mock import Mock
-
 import pytest

-from onyx.configs.app_configs import USE_CHUNK_SUMMARY
-from onyx.configs.app_configs import USE_DOCUMENT_SUMMARY
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.models import Document
 from onyx.connectors.models import TextSection
 from onyx.indexing.chunker import Chunker
 from onyx.indexing.embedder import DefaultIndexingEmbedder
 from onyx.indexing.indexing_pipeline import process_image_sections
-from onyx.llm.utils import MAX_CONTEXT_TOKENS
 from tests.unit.onyx.indexing.conftest import MockHeartbeat


-@pytest.mark.parametrize("enable_contextual_rag", [True, False])
-def test_chunk_document(
-    embedder: DefaultIndexingEmbedder, enable_contextual_rag: bool
-) -> None:
+@pytest.fixture
+def embedder() -> DefaultIndexingEmbedder:
+    return DefaultIndexingEmbedder(
+        model_name="intfloat/e5-base-v2",
+        normalize=True,
+        query_prefix=None,
+        passage_prefix=None,
+    )
+
+
+def test_chunk_document(embedder: DefaultIndexingEmbedder) -> None:
    short_section_1 = "This is a short section."
    long_section = (
        "This is a long section that should be split into multiple chunks. " * 100
@@ -44,22 +45,9 @@ def test_chunk_document(
    )
    indexing_documents = process_image_sections([document])

-    mock_llm_invoke_count = 0
-
-    def mock_llm_invoke(self: Any, *args: Any, **kwargs: Any) -> Mock:
-        nonlocal mock_llm_invoke_count
-        mock_llm_invoke_count += 1
-        m = Mock()
-        m.content = f"Test{mock_llm_invoke_count}"
-        return m
-
-    mock_llm = Mock()
-    mock_llm.invoke = mock_llm_invoke
-
    chunker = Chunker(
        tokenizer=embedder.embedding_model.tokenizer,
        enable_multipass=False,
-        enable_contextual_rag=enable_contextual_rag,
    )
    chunks = chunker.chunk(indexing_documents)

@@ -70,14 +58,6 @@ def test_chunk_document(
    assert "tag1" in chunks[0].metadata_suffix_keyword
    assert "tag2" in chunks[0].metadata_suffix_semantic

-    rag_tokens = MAX_CONTEXT_TOKENS * (
-        int(USE_DOCUMENT_SUMMARY) + int(USE_CHUNK_SUMMARY)
-    )
-    for chunk in chunks:
-        assert chunk.contextual_rag_reserved_tokens == (
-            rag_tokens if enable_contextual_rag else 0
-        )
-

 def test_chunker_heartbeat(
    embedder: DefaultIndexingEmbedder, mock_heartbeat: MockHeartbeat
@@ -98,7 +78,6 @@ def test_chunker_heartbeat(
        tokenizer=embedder.embedding_model.tokenizer,
        enable_multipass=False,
        callback=mock_heartbeat,
-        enable_contextual_rag=False,
    )

    chunks = chunker.chunk(indexing_documents)
--- a/backend/tests/unit/onyx/indexing/test_embedder.py
+++ b/backend/tests/unit/onyx/indexing/test_embedder.py
@@ -21,13 +21,7 @@ def mock_embedding_model() -> Generator[Mock, None, None]:
        yield mock


-@pytest.mark.parametrize(
-    "chunk_context, doc_summary",
-    [("Test chunk context", "Test document summary"), ("", "")],
-)
-def test_default_indexing_embedder_embed_chunks(
-    mock_embedding_model: Mock, chunk_context: str, doc_summary: str
-) -> None:
+def test_default_indexing_embedder_embed_chunks(mock_embedding_model: Mock) -> None:
    # Setup
    embedder = DefaultIndexingEmbedder(
        model_name="test-model",
@@ -69,9 +63,6 @@ def test_default_indexing_embedder_embed_chunks(
            large_chunk_reference_ids=[],
            large_chunk_id=None,
            image_file_name=None,
-            chunk_context=chunk_context,
-            doc_summary=doc_summary,
-            contextual_rag_reserved_tokens=200,
        )
    ]

@@ -90,7 +81,7 @@ def test_default_indexing_embedder_embed_chunks(

    # Verify the embedding model was called correctly
    mock_embedding_model.return_value.encode.assert_any_call(
-        texts=[f"Title: {doc_summary}Test chunk{chunk_context}"],
+        texts=["Title: Test chunk"],
        text_type=EmbedTextType.PASSAGE,
        large_chunks_present=False,
    )
--- a/backend/tests/unit/onyx/indexing/test_indexing_pipeline.py
+++ b/backend/tests/unit/onyx/indexing/test_indexing_pipeline.py
@@ -1,8 +1,6 @@
-from typing import Any
 from typing import cast
 from typing import List
 from unittest.mock import Mock
-from unittest.mock import patch

 import pytest

@@ -11,12 +9,8 @@ from onyx.connectors.models import Document
 from onyx.connectors.models import DocumentSource
 from onyx.connectors.models import ImageSection
 from onyx.connectors.models import TextSection
-from onyx.indexing.chunker import Chunker
-from onyx.indexing.embedder import DefaultIndexingEmbedder
 from onyx.indexing.indexing_pipeline import _get_aggregated_chunk_boost_factor
-from onyx.indexing.indexing_pipeline import add_contextual_summaries
 from onyx.indexing.indexing_pipeline import filter_documents
-from onyx.indexing.indexing_pipeline import process_image_sections
 from onyx.indexing.models import ChunkEmbedding
 from onyx.indexing.models import IndexChunk
 from onyx.natural_language_processing.search_nlp_models import (
@@ -172,9 +166,6 @@ def create_test_chunk(
        embeddings=ChunkEmbedding(full_embedding=[], mini_chunk_embeddings=[]),
        title_embedding=None,
        image_file_name=None,
-        chunk_context="",
-        doc_summary="",
-        contextual_rag_reserved_tokens=200,
    )


@@ -258,76 +249,3 @@ def test_get_aggregated_boost_factor_individual_failure() -> None:
        )

    assert "Failed to predict content classification for chunk" in str(exc_info.value)
-
-
-@patch("onyx.llm.utils.GEN_AI_MAX_TOKENS", 4096)
-@pytest.mark.parametrize("enable_contextual_rag", [True, False])
-def test_contextual_rag(
-    embedder: DefaultIndexingEmbedder, enable_contextual_rag: bool
-) -> None:
-    short_section_1 = "This is a short section."
-    long_section = (
-        "This is a long section that should be split into multiple chunks. " * 100
-    )
-    short_section_2 = "This is another short section."
-    short_section_3 = "This is another short section again."
-    short_section_4 = "Final short section."
-    semantic_identifier = "Test Document"
-
-    document = Document(
-        id="test_doc",
-        source=DocumentSource.WEB,
-        semantic_identifier=semantic_identifier,
-        metadata={"tags": ["tag1", "tag2"]},
-        doc_updated_at=None,
-        sections=[
-            TextSection(text=short_section_1, link="link1"),
-            TextSection(text=short_section_2, link="link2"),
-            TextSection(text=long_section, link="link3"),
-            TextSection(text=short_section_3, link="link4"),
-            TextSection(text=short_section_4, link="link5"),
-        ],
-    )
-    indexing_documents = process_image_sections([document])
-
-    mock_llm_invoke_count = 0
-
-    def mock_llm_invoke(self: Any, *args: Any, **kwargs: Any) -> Mock:
-        nonlocal mock_llm_invoke_count
-        mock_llm_invoke_count += 1
-        m = Mock()
-        m.content = f"Test{mock_llm_invoke_count}"
-        return m
-
-    llm_tokenizer = embedder.embedding_model.tokenizer
-
-    mock_llm = Mock()
-    mock_llm.invoke = mock_llm_invoke
-
-    chunker = Chunker(
-        tokenizer=embedder.embedding_model.tokenizer,
-        enable_multipass=False,
-        enable_contextual_rag=enable_contextual_rag,
-    )
-    chunks = chunker.chunk(indexing_documents)
-
-    chunks = add_contextual_summaries(
-        chunks, mock_llm, llm_tokenizer, chunker.chunk_token_limit * 2
-    )
-
-    assert len(chunks) == 5
-    assert short_section_1 in chunks[0].content
-    assert short_section_3 in chunks[-1].content
-    assert short_section_4 in chunks[-1].content
-    assert "tag1" in chunks[0].metadata_suffix_keyword
-    assert "tag2" in chunks[0].metadata_suffix_semantic
-
-    doc_summary = "Test1" if enable_contextual_rag else ""
-    chunk_context = ""
-    count = 2
-    for chunk in chunks:
-        if enable_contextual_rag:
-            chunk_context = f"Test{count}"
-            count += 1
-        assert chunk.doc_summary == doc_summary
-        assert chunk.chunk_context == chunk_context
--- a/backend/tests/unit/onyx/llm/test_chat_llm.py
+++ b/backend/tests/unit/onyx/llm/test_chat_llm.py
@@ -140,12 +140,12 @@ def test_multiple_tool_calls(default_multi_llm: DefaultMultiLLM) -> None:
            ],
            tools=tools,
            tool_choice=None,
-            max_tokens=None,
            stream=False,
            temperature=0.0,  # Default value from GEN_AI_TEMPERATURE
            timeout=30,
            parallel_tool_calls=False,
            mock_response=MOCK_LLM_RESPONSE,
+            max_tokens=None,
        )


@@ -286,10 +286,10 @@ def test_multiple_tool_calls_streaming(default_multi_llm: DefaultMultiLLM) -> No
            ],
            tools=tools,
            tool_choice=None,
-            max_tokens=None,
            stream=True,
            temperature=0.0,  # Default value from GEN_AI_TEMPERATURE
            timeout=30,
            parallel_tool_calls=False,
            mock_response=MOCK_LLM_RESPONSE,
+            max_tokens=None,
        )
--- a/web/src/app/admin/configuration/llm/constants.ts
+++ b/web/src/app/admin/configuration/llm/constants.ts
@@ -1,8 +1,5 @@
 export const LLM_PROVIDERS_ADMIN_URL = "/api/admin/llm/provider";

-export const LLM_CONTEXTUAL_COST_ADMIN_URL =
-  "/api/admin/llm/provider-contextual-cost";
-
 export const EMBEDDING_PROVIDERS_ADMIN_URL =
  "/api/admin/embedding/embedding-provider";

--- a/web/src/app/admin/configuration/search/page.tsx
+++ b/web/src/app/admin/configuration/search/page.tsx
@@ -143,15 +143,6 @@ function Main() {
                      </Text>
                    </div>

-                    <div>
-                      <Text className="font-semibold">Contextual RAG</Text>
-                      <Text className="text-text-700">
-                        {searchSettings.enable_contextual_rag
-                          ? "Enabled"
-                          : "Disabled"}
-                      </Text>
-                    </div>
-
                    <div>
                      <Text className="font-semibold">
                        Disable Reranking for Streaming
--- a/web/src/app/admin/embeddings/interfaces.ts
+++ b/web/src/app/admin/embeddings/interfaces.ts
@@ -26,18 +26,9 @@ export enum EmbeddingPrecision {
  BFLOAT16 = "bfloat16",
 }

-export interface LLMContextualCost {
-  provider: string;
-  model_name: string;
-  cost: number;
-}
-
 export interface AdvancedSearchConfiguration {
  index_name: string | null;
  multipass_indexing: boolean;
-  enable_contextual_rag: boolean;
-  contextual_rag_llm_name: string | null;
-  contextual_rag_llm_provider: string | null;
  multilingual_expansion: string[];
  disable_rerank_for_streaming: boolean;
  api_url: string | null;
--- a/web/src/app/admin/embeddings/pages/AdvancedEmbeddingFormPage.tsx
+++ b/web/src/app/admin/embeddings/pages/AdvancedEmbeddingFormPage.tsx
@@ -3,11 +3,7 @@ import { Formik, Form, FormikProps, FieldArray, Field } from "formik";
 import * as Yup from "yup";
 import { TrashIcon } from "@/components/icons/icons";
 import { FaPlus } from "react-icons/fa";
-import {
-  AdvancedSearchConfiguration,
-  EmbeddingPrecision,
-  LLMContextualCost,
-} from "../interfaces";
+import { AdvancedSearchConfiguration, EmbeddingPrecision } from "../interfaces";
 import {
  BooleanFormField,
  Label,
@@ -16,13 +12,6 @@ import {
 } from "@/components/admin/connectors/Field";
 import NumberInput from "../../connectors/[connector]/pages/ConnectorInput/NumberInput";
 import { StringOrNumberOption } from "@/components/Dropdown";
-import useSWR from "swr";
-import { LLM_CONTEXTUAL_COST_ADMIN_URL } from "../../configuration/llm/constants";
-import { getDisplayNameForModel } from "@/lib/hooks";
-import { errorHandlingFetcher } from "@/lib/fetcher";
-
-// Number of tokens to show cost calculation for
-const COST_CALCULATION_TOKENS = 1_000_000;

 interface AdvancedEmbeddingFormPageProps {
  updateAdvancedEmbeddingDetails: (
@@ -56,66 +45,14 @@ const AdvancedEmbeddingFormPage = forwardRef<
    },
    ref
  ) => {
-    // Fetch contextual costs
-    const { data: contextualCosts, error: costError } = useSWR<
-      LLMContextualCost[]
-    >(LLM_CONTEXTUAL_COST_ADMIN_URL, errorHandlingFetcher);
-
-    const llmOptions: StringOrNumberOption[] = React.useMemo(
-      () =>
-        (contextualCosts || []).map((cost) => {
-          return {
-            name: getDisplayNameForModel(cost.model_name),
-            value: cost.model_name,
-          };
-        }),
-      [contextualCosts]
-    );
-
-    // Helper function to format cost as USD
-    const formatCost = (cost: number) => {
-      return new Intl.NumberFormat("en-US", {
-        style: "currency",
-        currency: "USD",
-      }).format(cost);
-    };
-
-    // Get cost info for selected model
-    const getSelectedModelCost = (modelName: string | null) => {
-      if (!contextualCosts || !modelName) return null;
-      return contextualCosts.find((cost) => cost.model_name === modelName);
-    };
-
-    // Get the current value for the selector based on the parent state
-    const getCurrentLLMValue = React.useMemo(() => {
-      if (!advancedEmbeddingDetails.contextual_rag_llm_name) return null;
-      return advancedEmbeddingDetails.contextual_rag_llm_name;
-    }, [advancedEmbeddingDetails.contextual_rag_llm_name]);
-
    return (
      <div className="py-4 rounded-lg max-w-4xl px-4 mx-auto">
        <Formik
          innerRef={ref}
-          initialValues={{
-            ...advancedEmbeddingDetails,
-            contextual_rag_llm: getCurrentLLMValue,
-          }}
+          initialValues={advancedEmbeddingDetails}
          validationSchema={Yup.object().shape({
            multilingual_expansion: Yup.array().of(Yup.string()),
            multipass_indexing: Yup.boolean(),
-            enable_contextual_rag: Yup.boolean(),
-            contextual_rag_llm: Yup.string()
-              .nullable()
-              .test(
-                "required-if-contextual-rag",
-                "LLM must be selected when Contextual RAG is enabled",
-                function (value) {
-                  const enableContextualRag = this.parent.enable_contextual_rag;
-                  console.log("enableContextualRag", enableContextualRag);
-                  console.log("value", value);
-                  return !enableContextualRag || value !== null;
-                }
-              ),
            disable_rerank_for_streaming: Yup.boolean(),
            num_rerank: Yup.number()
              .required("Number of results to rerank is required")
@@ -142,26 +79,10 @@ const AdvancedEmbeddingFormPage = forwardRef<
          validate={(values) => {
            // Call updateAdvancedEmbeddingDetails for each changed field
            Object.entries(values).forEach(([key, value]) => {
-              if (key === "contextual_rag_llm") {
-                const selectedModel = (contextualCosts || []).find(
-                  (cost) => cost.model_name === value
-                );
-                if (selectedModel) {
-                  updateAdvancedEmbeddingDetails(
-                    "contextual_rag_llm_provider",
-                    selectedModel.provider
-                  );
-                  updateAdvancedEmbeddingDetails(
-                    "contextual_rag_llm_name",
-                    selectedModel.model_name
-                  );
-                }
-              } else {
-                updateAdvancedEmbeddingDetails(
-                  key as keyof AdvancedSearchConfiguration,
-                  value
-                );
-              }
+              updateAdvancedEmbeddingDetails(
+                key as keyof AdvancedSearchConfiguration,
+                value
+              );
            });

            // Run validation and report errors
@@ -175,23 +96,6 @@ const AdvancedEmbeddingFormPage = forwardRef<
                  .shape({
                    multilingual_expansion: Yup.array().of(Yup.string()),
                    multipass_indexing: Yup.boolean(),
-                    enable_contextual_rag: Yup.boolean(),
-                    contextual_rag_llm: Yup.string()
-                      .nullable()
-                      .test(
-                        "required-if-contextual-rag",
-                        "LLM must be selected when Contextual RAG is enabled",
-                        function (value) {
-                          const enableContextualRag =
-                            this.parent.enable_contextual_rag;
-                          console.log(
-                            "enableContextualRag2",
-                            enableContextualRag
-                          );
-                          console.log("value2", value);
-                          return !enableContextualRag || value !== null;
-                        }
-                      ),
                    disable_rerank_for_streaming: Yup.boolean(),
                    num_rerank: Yup.number()
                      .required("Number of results to rerank is required")
@@ -286,56 +190,6 @@ const AdvancedEmbeddingFormPage = forwardRef<
                label="Disable Rerank for Streaming"
                name="disable_rerank_for_streaming"
              />
-              <BooleanFormField
-                subtext="Enable contextual RAG for all chunk sizes."
-                optional
-                label="Contextual RAG"
-                name="enable_contextual_rag"
-              />
-              <div>
-                <SelectorFormField
-                  name="contextual_rag_llm"
-                  label="Contextual RAG LLM"
-                  subtext={
-                    costError
-                      ? "Error loading LLM models. Please try again later."
-                      : !contextualCosts
-                        ? "Loading available LLM models..."
-                        : values.enable_contextual_rag
-                          ? "Select the LLM model to use for contextual RAG processing."
-                          : "Enable Contextual RAG above to select an LLM model."
-                  }
-                  options={llmOptions}
-                  disabled={
-                    !values.enable_contextual_rag ||
-                    !contextualCosts ||
-                    !!costError
-                  }
-                />
-                {values.enable_contextual_rag &&
-                  values.contextual_rag_llm &&
-                  !costError && (
-                    <div className="mt-2 text-sm text-text-600">
-                      {contextualCosts ? (
-                        <>
-                          Estimated cost for processing{" "}
-                          {COST_CALCULATION_TOKENS.toLocaleString()} tokens:{" "}
-                          <span className="font-medium">
-                            {getSelectedModelCost(values.contextual_rag_llm)
-                              ? formatCost(
-                                  getSelectedModelCost(
-                                    values.contextual_rag_llm
-                                  )!.cost
-                                )
-                              : "Cost information not available"}
-                          </span>
-                        </>
-                      ) : (
-                        "Loading cost information..."
-                      )}
-                    </div>
-                  )}
-              </div>
              <NumberInput
                description="Number of results to rerank"
                optional={false}
--- a/web/src/app/admin/embeddings/pages/EmbeddingFormPage.tsx
+++ b/web/src/app/admin/embeddings/pages/EmbeddingFormPage.tsx
@@ -64,9 +64,6 @@ export default function EmbeddingForm() {
    useState<AdvancedSearchConfiguration>({
      index_name: "",
      multipass_indexing: true,
-      enable_contextual_rag: false,
-      contextual_rag_llm_name: null,
-      contextual_rag_llm_provider: null,
      multilingual_expansion: [],
      disable_rerank_for_streaming: false,
      api_url: null,
@@ -155,9 +152,6 @@ export default function EmbeddingForm() {
      setAdvancedEmbeddingDetails({
        index_name: searchSettings.index_name,
        multipass_indexing: searchSettings.multipass_indexing,
-        enable_contextual_rag: searchSettings.enable_contextual_rag,
-        contextual_rag_llm_name: searchSettings.contextual_rag_llm_name,
-        contextual_rag_llm_provider: searchSettings.contextual_rag_llm_provider,
        multilingual_expansion: searchSettings.multilingual_expansion,
        disable_rerank_for_streaming:
          searchSettings.disable_rerank_for_streaming,
@@ -203,9 +197,7 @@ export default function EmbeddingForm() {
    searchSettings?.embedding_precision !=
      advancedEmbeddingDetails.embedding_precision ||
    searchSettings?.reduced_dimension !=
-      advancedEmbeddingDetails.reduced_dimension ||
-    searchSettings?.enable_contextual_rag !=
-      advancedEmbeddingDetails.enable_contextual_rag;
+      advancedEmbeddingDetails.reduced_dimension;

  const updateSearch = useCallback(async () => {
    if (!selectedProvider) {
@@ -392,14 +384,6 @@ export default function EmbeddingForm() {
                    advancedEmbeddingDetails.reduced_dimension && (
                    <li>Reduced dimension modification</li>
                  )}
-                  {(searchSettings?.enable_contextual_rag !=
-                    advancedEmbeddingDetails.enable_contextual_rag ||
-                    searchSettings?.contextual_rag_llm_name !=
-                      advancedEmbeddingDetails.contextual_rag_llm_name ||
-                    searchSettings?.contextual_rag_llm_provider !=
-                      advancedEmbeddingDetails.contextual_rag_llm_provider) && (
-                    <li>Contextual RAG modification</li>
-                  )}
                </ul>
              </div>
            </div>
@@ -487,11 +471,6 @@ export default function EmbeddingForm() {
  };

  const handleReIndex = async () => {
-    console.log("handleReIndex");
-    console.log(selectedProvider);
-    console.log(advancedEmbeddingDetails);
-    console.log(rerankingDetails);
-    console.log(reindexType);
    if (!selectedProvider) {
      return;
    }
--- a/web/src/app/layout.tsx
+++ b/web/src/app/layout.tsx
@@ -20,7 +20,7 @@ import {
 import { fetchAssistantData } from "@/lib/chat/fetchAssistantdata";
 import { AppProvider } from "@/components/context/AppProvider";
 import { PHProvider } from "./providers";
-import { getAuthTypeMetadataSS, getCurrentUserSS } from "@/lib/userSS";
+import { getCurrentUserSS } from "@/lib/userSS";
 import { Suspense } from "react";
 import PostHogPageView from "./PostHogPageView";
 import Script from "next/script";
@@ -70,13 +70,11 @@ export default async function RootLayout({
 }: {
  children: React.ReactNode;
 }) {
-  const [combinedSettings, assistantsData, user, authTypeMetadata] =
-    await Promise.all([
-      fetchSettingsSS(),
-      fetchAssistantData(),
-      getCurrentUserSS(),
-      getAuthTypeMetadataSS(),
-    ]);
+  const [combinedSettings, assistantsData, user] = await Promise.all([
+    fetchSettingsSS(),
+    fetchAssistantData(),
+    getCurrentUserSS(),
+  ]);

  const productGating =
    combinedSettings?.settings.application_status ?? ApplicationStatus.ACTIVE;
@@ -149,7 +147,6 @@ export default async function RootLayout({

  return getPageContent(
    <AppProvider
-      authTypeMetadata={authTypeMetadata}
      user={user}
      settings={combinedSettings}
      assistants={assistants}
--- a/web/src/components/admin/connectors/Field.tsx
+++ b/web/src/components/admin/connectors/Field.tsx
@@ -676,7 +676,6 @@ interface SelectorFormFieldProps {
  includeReset?: boolean;
  fontSize?: "sm" | "md" | "lg";
  small?: boolean;
-  disabled?: boolean;
 }

 export function SelectorFormField({
@@ -692,7 +691,6 @@ export function SelectorFormField({
  includeReset = false,
  fontSize = "md",
  small = false,
-  disabled = false,
 }: SelectorFormFieldProps) {
  const [field] = useField<string>(name);
  const { setFieldValue } = useFormikContext();
@@ -744,9 +742,8 @@ export function SelectorFormField({
                : setFieldValue(name, selected))
          }
          defaultValue={defaultValue}
-          disabled={disabled}
        >
-          <SelectTrigger className={sizeClass.input} disabled={disabled}>
+          <SelectTrigger className={sizeClass.input}>
            <SelectValue placeholder="Select...">
              {currentlySelected?.name || defaultValue || ""}
            </SelectValue>
--- a/web/src/components/context/AppProvider.tsx
+++ b/web/src/components/context/AppProvider.tsx
@@ -7,7 +7,6 @@ import { AssistantsProvider } from "./AssistantsContext";
 import { Persona } from "@/app/admin/assistants/interfaces";
 import { User } from "@/lib/types";
 import { ModalProvider } from "./ModalContext";
-import { AuthTypeMetadata } from "@/lib/userSS";

 interface AppProviderProps {
  children: React.ReactNode;
@@ -16,7 +15,6 @@ interface AppProviderProps {
  assistants: Persona[];
  hasAnyConnectors: boolean;
  hasImageCompatibleModel: boolean;
-  authTypeMetadata: AuthTypeMetadata;
 }

 export const AppProvider = ({
@@ -26,15 +24,10 @@ export const AppProvider = ({
  assistants,
  hasAnyConnectors,
  hasImageCompatibleModel,
-  authTypeMetadata,
 }: AppProviderProps) => {
  return (
    <SettingsProvider settings={settings}>
-      <UserProvider
-        settings={settings}
-        user={user}
-        authTypeMetadata={authTypeMetadata}
-      >
+      <UserProvider settings={settings} user={user}>
        <ProviderContextProvider>
          <AssistantsProvider
            initialAssistants={assistants}
--- a/web/src/components/user/UserProvider.tsx
+++ b/web/src/components/user/UserProvider.tsx
@@ -13,7 +13,6 @@ import { usePostHog } from "posthog-js/react";
 import { CombinedSettings } from "@/app/admin/settings/interfaces";
 import { SettingsContext } from "../settings/SettingsProvider";
 import { useTokenRefresh } from "@/hooks/useTokenRefresh";
-import { AuthTypeMetadata } from "@/lib/userSS";

 interface UserContextType {
  user: User | null;
@@ -34,12 +33,10 @@ interface UserContextType {
 const UserContext = createContext<UserContextType | undefined>(undefined);

 export function UserProvider({
-  authTypeMetadata,
  children,
  user,
  settings,
 }: {
-  authTypeMetadata: AuthTypeMetadata;
  children: React.ReactNode;
  user: User | null;
  settings: CombinedSettings;
@@ -105,7 +102,7 @@ export function UserProvider({
  };

  // Use the custom token refresh hook
-  useTokenRefresh(upToDateUser, authTypeMetadata, fetchUser);
+  useTokenRefresh(upToDateUser, fetchUser);

  const updateUserTemperatureOverrideEnabled = async (enabled: boolean) => {
    try {
--- a/web/src/hooks/useTokenRefresh.ts
+++ b/web/src/hooks/useTokenRefresh.ts
@@ -1,7 +1,6 @@
 import { useState, useEffect, useRef } from "react";
 import { User } from "@/lib/types";
 import { NO_AUTH_USER_ID } from "@/lib/extension/constants";
-import { AuthTypeMetadata } from "@/lib/userSS";

 // Refresh token every 10 minutes (600000ms)
 // This is shorter than the session expiry time to ensure tokens stay valid
@@ -10,7 +9,6 @@ const REFRESH_INTERVAL = 600000;
 //  Custom hook for handling JWT token refresh for current user
 export function useTokenRefresh(
  user: User | null,
-  authTypeMetadata: AuthTypeMetadata,
  onRefreshFail: () => Promise<void>
 ) {
  // Track last refresh time to avoid unnecessary calls
@@ -20,13 +18,7 @@ export function useTokenRefresh(
  const isFirstLoad = useRef(true);

  useEffect(() => {
-    if (
-      !user ||
-      user.id === NO_AUTH_USER_ID ||
-      authTypeMetadata.authType === "oidc" ||
-      authTypeMetadata.authType === "saml"
-    )
-      return;
+    if (!user || user.id === NO_AUTH_USER_ID) return;

    const refreshTokenPeriodically = async () => {
      try {
--- a/web/src/lib/users/fetchUserData.ts
+++ b/web/src/lib/users/fetchUserData.ts
Author	SHA1	Message	Date
pablonyx	1e68232a8f	fix typing	2025-03-30 16:46:55 -07:00
pablonyx	d64f64987c	address nits	2025-03-30 14:43:46 -07:00
pablonyx	0f8ef5277e	k	2025-03-30 13:45:38 -07:00
pablonyx	8a76c539ed	k	2025-03-30 13:41:46 -07:00
pablonyx	4aa47a0264	cleaner handling	2025-03-30 13:40:40 -07:00