mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-02-16 23:35:46 +00:00
chore: Remove end of lived backend routes (#8453)
This commit is contained in:
@@ -27,6 +27,8 @@ class SearchFlowClassificationResponse(BaseModel):
|
||||
is_search_flow: bool
|
||||
|
||||
|
||||
# NOTE: This model is used for the core flow of the Onyx application, any changes to it should be reviewed and approved by an
|
||||
# experienced team member. It is very important to 1. avoid bloat and 2. that this remains backwards compatible across versions.
|
||||
class SendSearchQueryRequest(BaseModel):
|
||||
search_query: str
|
||||
filters: BaseFilters | None = None
|
||||
|
||||
@@ -67,6 +67,8 @@ def search_flow_classification(
|
||||
return SearchFlowClassificationResponse(is_search_flow=is_search_flow)
|
||||
|
||||
|
||||
# NOTE: This endpoint is used for the core flow of the Onyx application, any changes to it should be reviewed and approved by an
|
||||
# experienced team member. It is very important to 1. avoid bloat and 2. that this remains backwards compatible across versions.
|
||||
@router.post(
|
||||
"/send-search-message",
|
||||
response_model=None,
|
||||
|
||||
@@ -3,34 +3,26 @@ from collections.abc import Callable
|
||||
from typing import cast
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import HTTPException
|
||||
from fastapi.datastructures import Headers
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.auth.users import is_user_admin
|
||||
from onyx.chat.models import ChatHistoryResult
|
||||
from onyx.chat.models import ChatLoadedFile
|
||||
from onyx.chat.models import ChatMessageSimple
|
||||
from onyx.chat.models import FileToolMetadata
|
||||
from onyx.chat.models import PersonaOverrideConfig
|
||||
from onyx.chat.models import ToolCallSimple
|
||||
from onyx.configs.constants import DEFAULT_PERSONA_ID
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.configs.constants import TMP_DRALPHA_PERSONA_NAME
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
from onyx.db.chat import create_chat_session
|
||||
from onyx.db.chat import get_chat_messages_by_session
|
||||
from onyx.db.chat import get_or_create_root_message
|
||||
from onyx.db.kg_config import get_kg_config_settings
|
||||
from onyx.db.kg_config import is_kg_config_settings_enabled_valid
|
||||
from onyx.db.llm import fetch_existing_doc_sets
|
||||
from onyx.db.llm import fetch_existing_tools
|
||||
from onyx.db.models import ChatMessage
|
||||
from onyx.db.models import ChatSession
|
||||
from onyx.db.models import Persona
|
||||
from onyx.db.models import SearchDoc as DbSearchDoc
|
||||
from onyx.db.models import Tool
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.projects import check_project_ownership
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
@@ -47,9 +39,6 @@ from onyx.prompts.tool_prompts import TOOL_CALL_FAILURE_PROMPT
|
||||
from onyx.server.query_and_chat.models import ChatSessionCreationRequest
|
||||
from onyx.server.query_and_chat.streaming_models import CitationInfo
|
||||
from onyx.tools.models import ToolCallKickoff
|
||||
from onyx.tools.tool_implementations.custom.custom_tool import (
|
||||
build_custom_tools_from_openapi_schema_and_headers,
|
||||
)
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
|
||||
from onyx.utils.timing import log_function_time
|
||||
@@ -278,70 +267,6 @@ def extract_headers(
|
||||
return extracted_headers
|
||||
|
||||
|
||||
def create_temporary_persona(
|
||||
persona_config: PersonaOverrideConfig, db_session: Session, user: User
|
||||
) -> Persona:
|
||||
if not is_user_admin(user):
|
||||
raise HTTPException(
|
||||
status_code=403,
|
||||
detail="User is not authorized to create a persona in one shot queries",
|
||||
)
|
||||
|
||||
"""Create a temporary Persona object from the provided configuration."""
|
||||
persona = Persona(
|
||||
name=persona_config.name,
|
||||
description=persona_config.description,
|
||||
num_chunks=persona_config.num_chunks,
|
||||
llm_relevance_filter=persona_config.llm_relevance_filter,
|
||||
llm_filter_extraction=persona_config.llm_filter_extraction,
|
||||
recency_bias=RecencyBiasSetting.BASE_DECAY,
|
||||
llm_model_provider_override=persona_config.llm_model_provider_override,
|
||||
llm_model_version_override=persona_config.llm_model_version_override,
|
||||
)
|
||||
|
||||
if persona_config.prompts:
|
||||
# Use the first prompt from the override config for embedded prompt fields
|
||||
first_prompt = persona_config.prompts[0]
|
||||
persona.system_prompt = first_prompt.system_prompt
|
||||
persona.task_prompt = first_prompt.task_prompt
|
||||
persona.datetime_aware = first_prompt.datetime_aware
|
||||
|
||||
persona.tools = []
|
||||
if persona_config.custom_tools_openapi:
|
||||
from onyx.chat.emitter import get_default_emitter
|
||||
|
||||
for schema in persona_config.custom_tools_openapi:
|
||||
tools = cast(
|
||||
list[Tool],
|
||||
build_custom_tools_from_openapi_schema_and_headers(
|
||||
tool_id=0, # dummy tool id
|
||||
openapi_schema=schema,
|
||||
emitter=get_default_emitter(),
|
||||
),
|
||||
)
|
||||
persona.tools.extend(tools)
|
||||
|
||||
if persona_config.tools:
|
||||
tool_ids = [tool.id for tool in persona_config.tools]
|
||||
persona.tools.extend(
|
||||
fetch_existing_tools(db_session=db_session, tool_ids=tool_ids)
|
||||
)
|
||||
|
||||
if persona_config.tool_ids:
|
||||
persona.tools.extend(
|
||||
fetch_existing_tools(
|
||||
db_session=db_session, tool_ids=persona_config.tool_ids
|
||||
)
|
||||
)
|
||||
|
||||
fetched_docs = fetch_existing_doc_sets(
|
||||
db_session=db_session, doc_ids=persona_config.document_set_ids
|
||||
)
|
||||
persona.document_sets = fetched_docs
|
||||
|
||||
return persona
|
||||
|
||||
|
||||
def process_kg_commands(
|
||||
message: str, persona_name: str, tenant_id: str, db_session: Session # noqa: ARG001
|
||||
) -> None:
|
||||
|
||||
@@ -1,17 +1,13 @@
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.context.search.enums import SearchType
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
from onyx.file_store.models import InMemoryChatFile
|
||||
from onyx.server.query_and_chat.models import MessageResponseIDInfo
|
||||
from onyx.server.query_and_chat.streaming_models import CitationInfo
|
||||
from onyx.server.query_and_chat.streaming_models import GeneratedImage
|
||||
from onyx.server.query_and_chat.streaming_models import Packet
|
||||
@@ -20,54 +16,6 @@ from onyx.tools.models import ToolCallKickoff
|
||||
from onyx.tools.tool_implementations.custom.base_tool_types import ToolResultType
|
||||
|
||||
|
||||
class StreamStopReason(Enum):
|
||||
CONTEXT_LENGTH = "context_length"
|
||||
CANCELLED = "cancelled"
|
||||
FINISHED = "finished"
|
||||
|
||||
|
||||
class StreamType(Enum):
|
||||
SUB_QUESTIONS = "sub_questions"
|
||||
SUB_ANSWER = "sub_answer"
|
||||
MAIN_ANSWER = "main_answer"
|
||||
|
||||
|
||||
class StreamStopInfo(BaseModel):
|
||||
stop_reason: StreamStopReason
|
||||
|
||||
stream_type: StreamType = StreamType.MAIN_ANSWER
|
||||
|
||||
def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore
|
||||
data = super().model_dump(mode="json", *args, **kwargs) # type: ignore
|
||||
data["stop_reason"] = self.stop_reason.name
|
||||
return data
|
||||
|
||||
|
||||
class UserKnowledgeFilePacket(BaseModel):
|
||||
user_files: list[FileDescriptor]
|
||||
|
||||
|
||||
class RelevanceAnalysis(BaseModel):
|
||||
relevant: bool
|
||||
content: str | None = None
|
||||
|
||||
|
||||
class DocumentRelevance(BaseModel):
|
||||
"""Contains all relevance information for a given search"""
|
||||
|
||||
relevance_summaries: dict[str, RelevanceAnalysis]
|
||||
|
||||
|
||||
class OnyxAnswerPiece(BaseModel):
|
||||
# A small piece of a complete answer. Used for streaming back answers.
|
||||
answer_piece: str | None # if None, specifies the end of an Answer
|
||||
|
||||
|
||||
class MessageResponseIDInfo(BaseModel):
|
||||
user_message_id: int | None
|
||||
reserved_assistant_message_id: int
|
||||
|
||||
|
||||
class StreamingError(BaseModel):
|
||||
error: str
|
||||
stack_trace: str | None = None
|
||||
@@ -78,23 +26,11 @@ class StreamingError(BaseModel):
|
||||
details: dict | None = None # Additional context (tool name, model name, etc.)
|
||||
|
||||
|
||||
class OnyxAnswer(BaseModel):
|
||||
answer: str | None
|
||||
|
||||
|
||||
class FileChatDisplay(BaseModel):
|
||||
file_ids: list[str]
|
||||
|
||||
|
||||
class CustomToolResponse(BaseModel):
|
||||
response: ToolResultType
|
||||
tool_name: str
|
||||
|
||||
|
||||
class ToolConfig(BaseModel):
|
||||
id: int
|
||||
|
||||
|
||||
class ProjectSearchConfig(BaseModel):
|
||||
"""Configuration for search tool availability in project context."""
|
||||
|
||||
@@ -102,83 +38,15 @@ class ProjectSearchConfig(BaseModel):
|
||||
disable_forced_tool: bool
|
||||
|
||||
|
||||
class PromptOverrideConfig(BaseModel):
|
||||
name: str
|
||||
description: str = ""
|
||||
system_prompt: str
|
||||
task_prompt: str = ""
|
||||
datetime_aware: bool = True
|
||||
include_citations: bool = True
|
||||
|
||||
|
||||
class PersonaOverrideConfig(BaseModel):
|
||||
name: str
|
||||
description: str
|
||||
search_type: SearchType = SearchType.SEMANTIC
|
||||
num_chunks: float | None = None
|
||||
llm_relevance_filter: bool = False
|
||||
llm_filter_extraction: bool = False
|
||||
llm_model_provider_override: str | None = None
|
||||
llm_model_version_override: str | None = None
|
||||
|
||||
prompts: list[PromptOverrideConfig] = Field(default_factory=list)
|
||||
# Note: prompt_ids removed - prompts are now embedded in personas
|
||||
|
||||
document_set_ids: list[int] = Field(default_factory=list)
|
||||
tools: list[ToolConfig] = Field(default_factory=list)
|
||||
tool_ids: list[int] = Field(default_factory=list)
|
||||
custom_tools_openapi: list[dict[str, Any]] = Field(default_factory=list)
|
||||
|
||||
|
||||
AnswerQuestionPossibleReturn = (
|
||||
OnyxAnswerPiece
|
||||
| CitationInfo
|
||||
| FileChatDisplay
|
||||
| CustomToolResponse
|
||||
| StreamingError
|
||||
| StreamStopInfo
|
||||
)
|
||||
|
||||
|
||||
class CreateChatSessionID(BaseModel):
|
||||
chat_session_id: UUID
|
||||
|
||||
|
||||
AnswerQuestionStreamReturn = Iterator[AnswerQuestionPossibleReturn]
|
||||
|
||||
|
||||
class LLMMetricsContainer(BaseModel):
|
||||
prompt_tokens: int
|
||||
response_tokens: int
|
||||
|
||||
|
||||
StreamProcessor = Callable[[Iterator[str]], AnswerQuestionStreamReturn]
|
||||
|
||||
|
||||
AnswerStreamPart = (
|
||||
Packet
|
||||
| StreamStopInfo
|
||||
| MessageResponseIDInfo
|
||||
| StreamingError
|
||||
| UserKnowledgeFilePacket
|
||||
| CreateChatSessionID
|
||||
)
|
||||
AnswerStreamPart = Packet | MessageResponseIDInfo | StreamingError | CreateChatSessionID
|
||||
|
||||
AnswerStream = Iterator[AnswerStreamPart]
|
||||
|
||||
|
||||
class ChatBasicResponse(BaseModel):
|
||||
# This is built piece by piece, any of these can be None as the flow could break
|
||||
answer: str
|
||||
answer_citationless: str
|
||||
|
||||
top_documents: list[SearchDoc]
|
||||
|
||||
error_msg: str | None
|
||||
message_id: int
|
||||
citation_info: list[CitationInfo]
|
||||
|
||||
|
||||
class ToolCallResponse(BaseModel):
|
||||
"""Tool call with full details for non-streaming response."""
|
||||
|
||||
@@ -191,8 +59,23 @@ class ToolCallResponse(BaseModel):
|
||||
pre_reasoning: str | None = None
|
||||
|
||||
|
||||
class ChatBasicResponse(BaseModel):
|
||||
# This is built piece by piece, any of these can be None as the flow could break
|
||||
answer: str
|
||||
answer_citationless: str
|
||||
|
||||
top_documents: list[SearchDoc]
|
||||
|
||||
error_msg: str | None
|
||||
message_id: int
|
||||
citation_info: list[CitationInfo]
|
||||
|
||||
|
||||
class ChatFullResponse(BaseModel):
|
||||
"""Complete non-streaming response with all available data."""
|
||||
"""Complete non-streaming response with all available data.
|
||||
NOTE: This model is used for the core flow of the Onyx application, any changes to it should be reviewed and approved by an
|
||||
experienced team member. It is very important to 1. avoid bloat and 2. that this remains backwards compatible across versions.
|
||||
"""
|
||||
|
||||
# Core response fields
|
||||
answer: str
|
||||
|
||||
@@ -37,7 +37,6 @@ from onyx.chat.models import ChatMessageSimple
|
||||
from onyx.chat.models import CreateChatSessionID
|
||||
from onyx.chat.models import ExtractedProjectFiles
|
||||
from onyx.chat.models import FileToolMetadata
|
||||
from onyx.chat.models import MessageResponseIDInfo
|
||||
from onyx.chat.models import ProjectFileMetadata
|
||||
from onyx.chat.models import ProjectSearchConfig
|
||||
from onyx.chat.models import StreamingError
|
||||
@@ -81,8 +80,7 @@ from onyx.llm.utils import litellm_exception_to_error_msg
|
||||
from onyx.onyxbot.slack.models import SlackContext
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.server.query_and_chat.models import AUTO_PLACE_AFTER_LATEST_MESSAGE
|
||||
from onyx.server.query_and_chat.models import CreateChatMessageRequest
|
||||
from onyx.server.query_and_chat.models import OptionalSearchSetting
|
||||
from onyx.server.query_and_chat.models import MessageResponseIDInfo
|
||||
from onyx.server.query_and_chat.models import SendMessageRequest
|
||||
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
|
||||
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
|
||||
@@ -1027,68 +1025,6 @@ def llm_loop_completion_handle(
|
||||
)
|
||||
|
||||
|
||||
def stream_chat_message_objects(
|
||||
new_msg_req: CreateChatMessageRequest,
|
||||
user: User,
|
||||
db_session: Session,
|
||||
# if specified, uses the last user message and does not create a new user message based
|
||||
# on the `new_msg_req.message`. Currently, requires a state where the last message is a
|
||||
litellm_additional_headers: dict[str, str] | None = None,
|
||||
custom_tool_additional_headers: dict[str, str] | None = None,
|
||||
bypass_acl: bool = False,
|
||||
# Additional context that should be included in the chat history, for example:
|
||||
# Slack threads where the conversation cannot be represented by a chain of User/Assistant
|
||||
# messages. Both of the below are used for Slack
|
||||
# NOTE: is not stored in the database, only passed in to the LLM as context
|
||||
additional_context: str | None = None,
|
||||
# Slack context for federated Slack search
|
||||
slack_context: SlackContext | None = None,
|
||||
) -> AnswerStream:
|
||||
forced_tool_id = (
|
||||
new_msg_req.forced_tool_ids[0] if new_msg_req.forced_tool_ids else None
|
||||
)
|
||||
if (
|
||||
new_msg_req.retrieval_options
|
||||
and new_msg_req.retrieval_options.run_search == OptionalSearchSetting.ALWAYS
|
||||
):
|
||||
all_tools = get_tools(db_session)
|
||||
|
||||
search_tool_id = next(
|
||||
(tool.id for tool in all_tools if tool.in_code_tool_id == SEARCH_TOOL_ID),
|
||||
None,
|
||||
)
|
||||
forced_tool_id = search_tool_id
|
||||
|
||||
translated_new_msg_req = SendMessageRequest(
|
||||
message=new_msg_req.message,
|
||||
llm_override=new_msg_req.llm_override,
|
||||
mock_llm_response=new_msg_req.mock_llm_response,
|
||||
allowed_tool_ids=new_msg_req.allowed_tool_ids,
|
||||
forced_tool_id=forced_tool_id,
|
||||
file_descriptors=new_msg_req.file_descriptors,
|
||||
internal_search_filters=(
|
||||
new_msg_req.retrieval_options.filters
|
||||
if new_msg_req.retrieval_options
|
||||
else None
|
||||
),
|
||||
deep_research=new_msg_req.deep_research,
|
||||
parent_message_id=new_msg_req.parent_message_id,
|
||||
chat_session_id=new_msg_req.chat_session_id,
|
||||
origin=new_msg_req.origin,
|
||||
include_citations=new_msg_req.include_citations,
|
||||
)
|
||||
return handle_stream_message_objects(
|
||||
new_msg_req=translated_new_msg_req,
|
||||
user=user,
|
||||
db_session=db_session,
|
||||
litellm_additional_headers=litellm_additional_headers,
|
||||
custom_tool_additional_headers=custom_tool_additional_headers,
|
||||
bypass_acl=bypass_acl,
|
||||
additional_context=additional_context,
|
||||
slack_context=slack_context,
|
||||
)
|
||||
|
||||
|
||||
def remove_answer_citations(answer: str) -> str:
|
||||
pattern = r"\s*\[\[\d+\]\]\(http[s]?://[^\s]+\)"
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
from pydantic import field_validator
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.db.models import SearchSettings
|
||||
@@ -97,21 +96,6 @@ class IndexFilters(BaseFilters, UserFileFilters, AssistantKnowledgeFilters):
|
||||
tenant_id: str | None = None
|
||||
|
||||
|
||||
class ChunkContext(BaseModel):
|
||||
# If not specified (None), picked up from Persona settings if there is space
|
||||
# if specified (even if 0), it always uses the specified number of chunks above and below
|
||||
chunks_above: int | None = None
|
||||
chunks_below: int | None = None
|
||||
full_doc: bool = False
|
||||
|
||||
@field_validator("chunks_above", "chunks_below")
|
||||
@classmethod
|
||||
def check_non_negative(cls, value: int, field: Any) -> int:
|
||||
if value is not None and value < 0:
|
||||
raise ValueError(f"{field.name} must be non-negative")
|
||||
return value
|
||||
|
||||
|
||||
class BasicChunkRequest(BaseModel):
|
||||
query: str
|
||||
|
||||
|
||||
@@ -19,7 +19,6 @@ from sqlalchemy.exc import MultipleResultsFound
|
||||
from sqlalchemy.orm import selectinload
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.chat.models import DocumentRelevance
|
||||
from onyx.configs.chat_configs import HARD_DELETE_CHATS
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.context.search.models import InferenceSection
|
||||
@@ -672,27 +671,6 @@ def set_as_latest_chat_message(
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def update_search_docs_table_with_relevance(
|
||||
db_session: Session,
|
||||
reference_db_search_docs: list[DBSearchDoc],
|
||||
relevance_summary: DocumentRelevance,
|
||||
) -> None:
|
||||
for search_doc in reference_db_search_docs:
|
||||
relevance_data = relevance_summary.relevance_summaries.get(
|
||||
search_doc.document_id
|
||||
)
|
||||
if relevance_data is not None:
|
||||
db_session.execute(
|
||||
update(DBSearchDoc)
|
||||
.where(DBSearchDoc.id == search_doc.id)
|
||||
.values(
|
||||
is_relevant=relevance_data.relevant,
|
||||
relevance_explanation=relevance_data.content,
|
||||
)
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def _sanitize_for_postgres(value: str) -> str:
|
||||
"""Remove NUL (0x00) characters from strings as PostgreSQL doesn't allow them."""
|
||||
sanitized = value.replace("\x00", "")
|
||||
|
||||
@@ -4,23 +4,21 @@ from collections.abc import Generator
|
||||
from contextlib import contextmanager
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import Engine
|
||||
from sqlalchemy import event
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.orm.session import SessionTransaction
|
||||
|
||||
from onyx.chat.models import MessageResponseIDInfo
|
||||
from onyx.chat.models import StreamingError
|
||||
from onyx.chat.process_message import AnswerStream
|
||||
from onyx.chat.chat_state import ChatStateContainer
|
||||
from onyx.chat.models import ChatFullResponse
|
||||
from onyx.chat.process_message import gather_stream_full
|
||||
from onyx.chat.process_message import handle_stream_message_objects
|
||||
from onyx.chat.process_message import remove_answer_citations
|
||||
from onyx.chat.process_message import stream_chat_message_objects
|
||||
from onyx.configs.constants import DEFAULT_PERSONA_ID
|
||||
from onyx.db.chat import create_chat_session
|
||||
from onyx.db.engine.sql_engine import get_sqlalchemy_engine
|
||||
from onyx.db.users import get_user_by_email
|
||||
from onyx.evals.models import ChatFullEvalResult
|
||||
from onyx.evals.models import EvalationAck
|
||||
from onyx.evals.models import EvalConfigurationOptions
|
||||
from onyx.evals.models import EvalMessage
|
||||
@@ -33,18 +31,7 @@ from onyx.evals.provider import get_provider
|
||||
from onyx.llm.override_models import LLMOverride
|
||||
from onyx.server.query_and_chat.models import AUTO_PLACE_AFTER_LATEST_MESSAGE
|
||||
from onyx.server.query_and_chat.models import ChatSessionCreationRequest
|
||||
from onyx.server.query_and_chat.models import CreateChatMessageRequest
|
||||
from onyx.server.query_and_chat.models import RetrievalDetails
|
||||
from onyx.server.query_and_chat.models import SendMessageRequest
|
||||
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
|
||||
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
|
||||
from onyx.server.query_and_chat.streaming_models import CitationInfo
|
||||
from onyx.server.query_and_chat.streaming_models import CustomToolStart
|
||||
from onyx.server.query_and_chat.streaming_models import ImageGenerationToolStart
|
||||
from onyx.server.query_and_chat.streaming_models import OpenUrlStart
|
||||
from onyx.server.query_and_chat.streaming_models import Packet
|
||||
from onyx.server.query_and_chat.streaming_models import PythonToolStart
|
||||
from onyx.server.query_and_chat.streaming_models import SearchToolStart
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
@@ -87,193 +74,29 @@ def isolated_ephemeral_session_factory(
|
||||
conn.close()
|
||||
|
||||
|
||||
class GatherStreamResult(BaseModel):
|
||||
"""Result of gathering a stream with tool call information."""
|
||||
|
||||
answer: str
|
||||
answer_citationless: str
|
||||
tools_called: list[str]
|
||||
tool_call_details: list[dict[str, Any]]
|
||||
message_id: int
|
||||
error_msg: str | None = None
|
||||
citations: list[CitationInfo] = []
|
||||
timings: EvalTimings | None = None
|
||||
|
||||
|
||||
def gather_stream_with_tools(packets: AnswerStream) -> GatherStreamResult:
|
||||
"""
|
||||
Gather streaming packets and extract both answer content and tool call information.
|
||||
|
||||
Returns a GatherStreamResult containing the answer and all tools that were called.
|
||||
"""
|
||||
stream_start_time = time.time()
|
||||
|
||||
answer: str | None = None
|
||||
citations: list[CitationInfo] = []
|
||||
error_msg: str | None = None
|
||||
message_id: int | None = None
|
||||
tools_called: list[str] = []
|
||||
tool_call_details: list[dict[str, Any]] = []
|
||||
|
||||
# Timing tracking
|
||||
first_token_time: float | None = None
|
||||
tool_start_times: dict[str, float] = {} # tool_name -> start time
|
||||
tool_execution_ms: dict[str, float] = {} # tool_name -> duration in ms
|
||||
current_tool: str | None = None
|
||||
|
||||
def _finalize_tool_timing(tool_name: str) -> None:
|
||||
"""Record the duration for a tool that just finished."""
|
||||
if tool_name in tool_start_times:
|
||||
duration_ms = (time.time() - tool_start_times[tool_name]) * 1000
|
||||
tool_execution_ms[tool_name] = duration_ms
|
||||
|
||||
for packet in packets:
|
||||
if isinstance(packet, Packet):
|
||||
obj = packet.obj
|
||||
|
||||
# Handle answer content
|
||||
if isinstance(obj, AgentResponseStart):
|
||||
# When answer starts, finalize any in-progress tool
|
||||
if current_tool:
|
||||
_finalize_tool_timing(current_tool)
|
||||
current_tool = None
|
||||
elif isinstance(obj, AgentResponseDelta):
|
||||
if answer is None:
|
||||
answer = ""
|
||||
first_token_time = time.time()
|
||||
if obj.content:
|
||||
answer += obj.content
|
||||
elif isinstance(obj, CitationInfo):
|
||||
citations.append(obj)
|
||||
|
||||
# Track tool calls with timing
|
||||
elif isinstance(obj, SearchToolStart):
|
||||
# Finalize any previous tool
|
||||
if current_tool:
|
||||
_finalize_tool_timing(current_tool)
|
||||
|
||||
tool_name = "WebSearchTool" if obj.is_internet_search else "SearchTool"
|
||||
current_tool = tool_name
|
||||
tool_start_times[tool_name] = time.time()
|
||||
tools_called.append(tool_name)
|
||||
tool_call_details.append(
|
||||
{
|
||||
"tool_name": tool_name,
|
||||
"tool_type": "search",
|
||||
"is_internet_search": obj.is_internet_search,
|
||||
}
|
||||
)
|
||||
elif isinstance(obj, ImageGenerationToolStart):
|
||||
if current_tool:
|
||||
_finalize_tool_timing(current_tool)
|
||||
|
||||
tool_name = "ImageGenerationTool"
|
||||
current_tool = tool_name
|
||||
tool_start_times[tool_name] = time.time()
|
||||
tools_called.append(tool_name)
|
||||
tool_call_details.append(
|
||||
{
|
||||
"tool_name": tool_name,
|
||||
"tool_type": "image_generation",
|
||||
}
|
||||
)
|
||||
elif isinstance(obj, PythonToolStart):
|
||||
if current_tool:
|
||||
_finalize_tool_timing(current_tool)
|
||||
|
||||
tool_name = "PythonTool"
|
||||
current_tool = tool_name
|
||||
tool_start_times[tool_name] = time.time()
|
||||
tools_called.append(tool_name)
|
||||
tool_call_details.append(
|
||||
{
|
||||
"tool_name": tool_name,
|
||||
"tool_type": "python",
|
||||
"code": obj.code,
|
||||
}
|
||||
)
|
||||
elif isinstance(obj, OpenUrlStart):
|
||||
if current_tool:
|
||||
_finalize_tool_timing(current_tool)
|
||||
|
||||
tool_name = "OpenURLTool"
|
||||
current_tool = tool_name
|
||||
tool_start_times[tool_name] = time.time()
|
||||
tools_called.append(tool_name)
|
||||
tool_call_details.append(
|
||||
{
|
||||
"tool_name": tool_name,
|
||||
"tool_type": "open_url",
|
||||
}
|
||||
)
|
||||
elif isinstance(obj, CustomToolStart):
|
||||
if current_tool:
|
||||
_finalize_tool_timing(current_tool)
|
||||
|
||||
tool_name = obj.tool_name
|
||||
current_tool = tool_name
|
||||
tool_start_times[tool_name] = time.time()
|
||||
tools_called.append(tool_name)
|
||||
tool_call_details.append(
|
||||
{
|
||||
"tool_name": tool_name,
|
||||
"tool_type": "custom",
|
||||
}
|
||||
)
|
||||
|
||||
elif isinstance(packet, StreamingError):
|
||||
logger.warning(f"Streaming error during eval: {packet.error}")
|
||||
error_msg = packet.error
|
||||
elif isinstance(packet, MessageResponseIDInfo):
|
||||
message_id = packet.reserved_assistant_message_id
|
||||
|
||||
# Finalize any remaining tool timing
|
||||
if current_tool:
|
||||
_finalize_tool_timing(current_tool)
|
||||
|
||||
def _chat_full_response_to_eval_result(
|
||||
full: ChatFullResponse,
|
||||
stream_start_time: float,
|
||||
) -> ChatFullEvalResult:
|
||||
"""Map ChatFullResponse from gather_stream_full to eval result components."""
|
||||
tools_called = [tc.tool_name for tc in full.tool_calls]
|
||||
tool_call_details: list[dict[str, Any]] = [
|
||||
{"tool_name": tc.tool_name, "tool_arguments": tc.tool_arguments}
|
||||
for tc in full.tool_calls
|
||||
]
|
||||
stream_end_time = time.time()
|
||||
|
||||
if message_id is None:
|
||||
# If we got a streaming error, include it in the exception
|
||||
if error_msg:
|
||||
raise ValueError(f"Message ID is required. Stream error: {error_msg}")
|
||||
raise ValueError(
|
||||
f"Message ID is required. No MessageResponseIDInfo received. "
|
||||
f"Tools called: {tools_called}"
|
||||
)
|
||||
|
||||
# Allow empty answers for tool-only turns (e.g., in multi-turn evals)
|
||||
# Some turns may only execute tools without generating a text response
|
||||
if answer is None:
|
||||
logger.warning(
|
||||
"No answer content generated. Tools called: %s. "
|
||||
"This may be expected for tool-only turns.",
|
||||
tools_called,
|
||||
)
|
||||
answer = ""
|
||||
|
||||
# Calculate timings
|
||||
total_ms = (stream_end_time - stream_start_time) * 1000
|
||||
first_token_ms = (
|
||||
(first_token_time - stream_start_time) * 1000 if first_token_time else None
|
||||
)
|
||||
stream_processing_ms = (stream_end_time - stream_start_time) * 1000
|
||||
|
||||
timings = EvalTimings(
|
||||
total_ms=total_ms,
|
||||
llm_first_token_ms=first_token_ms,
|
||||
tool_execution_ms=tool_execution_ms,
|
||||
stream_processing_ms=stream_processing_ms,
|
||||
llm_first_token_ms=None,
|
||||
tool_execution_ms={},
|
||||
stream_processing_ms=total_ms,
|
||||
)
|
||||
|
||||
return GatherStreamResult(
|
||||
answer=answer,
|
||||
answer_citationless=remove_answer_citations(answer),
|
||||
return ChatFullEvalResult(
|
||||
answer=full.answer,
|
||||
tools_called=tools_called,
|
||||
tool_call_details=tool_call_details,
|
||||
message_id=message_id,
|
||||
error_msg=error_msg,
|
||||
citations=citations,
|
||||
citations=full.citation_info,
|
||||
timings=timings,
|
||||
)
|
||||
|
||||
@@ -413,14 +236,17 @@ def _get_answer_with_tools(
|
||||
),
|
||||
)
|
||||
|
||||
stream_start_time = time.time()
|
||||
state_container = ChatStateContainer()
|
||||
packets = handle_stream_message_objects(
|
||||
new_msg_req=request,
|
||||
user=user,
|
||||
db_session=db_session,
|
||||
external_state_container=state_container,
|
||||
)
|
||||
full = gather_stream_full(packets, state_container)
|
||||
|
||||
# Gather stream with tool call tracking
|
||||
result = gather_stream_with_tools(packets)
|
||||
result = _chat_full_response_to_eval_result(full, stream_start_time)
|
||||
|
||||
# Evaluate tool assertions
|
||||
assertion_passed, assertion_details = evaluate_tool_assertions(
|
||||
@@ -551,30 +377,30 @@ def _get_multi_turn_answer_with_tools(
|
||||
),
|
||||
)
|
||||
|
||||
# Create request for this turn
|
||||
# Create request for this turn using SendMessageRequest (same API as handle_stream_message_objects)
|
||||
# Use AUTO_PLACE_AFTER_LATEST_MESSAGE to chain messages
|
||||
request = CreateChatMessageRequest(
|
||||
forced_tool_id = forced_tool_ids[0] if forced_tool_ids else None
|
||||
request = SendMessageRequest(
|
||||
chat_session_id=chat_session_id,
|
||||
parent_message_id=AUTO_PLACE_AFTER_LATEST_MESSAGE,
|
||||
message=msg.message,
|
||||
file_descriptors=[],
|
||||
search_doc_ids=None,
|
||||
retrieval_options=RetrievalDetails(),
|
||||
llm_override=llm_override,
|
||||
persona_override_config=full_configuration.persona_override_config,
|
||||
skip_gen_ai_answer_generation=False,
|
||||
allowed_tool_ids=full_configuration.allowed_tool_ids,
|
||||
forced_tool_ids=forced_tool_ids or None,
|
||||
forced_tool_id=forced_tool_id,
|
||||
)
|
||||
|
||||
# Stream and gather results for this turn
|
||||
packets = stream_chat_message_objects(
|
||||
# Stream and gather results for this turn via handle_stream_message_objects + gather_stream_full
|
||||
stream_start_time = time.time()
|
||||
state_container = ChatStateContainer()
|
||||
packets = handle_stream_message_objects(
|
||||
new_msg_req=request,
|
||||
user=user,
|
||||
db_session=db_session,
|
||||
external_state_container=state_container,
|
||||
)
|
||||
full = gather_stream_full(packets, state_container)
|
||||
|
||||
result = gather_stream_with_tools(packets)
|
||||
result = _chat_full_response_to_eval_result(full, stream_start_time)
|
||||
|
||||
# Evaluate tool assertions for this turn
|
||||
assertion_passed, assertion_details = evaluate_tool_assertions(
|
||||
|
||||
@@ -7,9 +7,6 @@ from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.chat.models import PersonaOverrideConfig
|
||||
from onyx.chat.models import PromptOverrideConfig
|
||||
from onyx.chat.models import ToolConfig
|
||||
from onyx.db.tools import get_builtin_tool
|
||||
from onyx.llm.override_models import LLMOverride
|
||||
from onyx.server.query_and_chat.streaming_models import CitationInfo
|
||||
@@ -34,6 +31,16 @@ class EvalTimings(BaseModel):
|
||||
stream_processing_ms: float | None = None # Time to process the stream
|
||||
|
||||
|
||||
class ChatFullEvalResult(BaseModel):
|
||||
"""Raw eval components from ChatFullResponse (before tool assertions)."""
|
||||
|
||||
answer: str
|
||||
tools_called: list[str]
|
||||
tool_call_details: list[dict[str, Any]]
|
||||
citations: list[CitationInfo]
|
||||
timings: EvalTimings
|
||||
|
||||
|
||||
class EvalToolResult(BaseModel):
|
||||
"""Result of a single eval with tool call information."""
|
||||
|
||||
@@ -72,8 +79,6 @@ class MultiTurnEvalResult(BaseModel):
|
||||
|
||||
|
||||
class EvalConfiguration(BaseModel):
|
||||
builtin_tool_types: list[str] = Field(default_factory=list)
|
||||
persona_override_config: PersonaOverrideConfig | None = None
|
||||
llm: LLMOverride = Field(default_factory=LLMOverride)
|
||||
search_permissions_email: str
|
||||
allowed_tool_ids: list[int]
|
||||
@@ -81,7 +86,6 @@ class EvalConfiguration(BaseModel):
|
||||
|
||||
class EvalConfigurationOptions(BaseModel):
|
||||
builtin_tool_types: list[str] = list(BUILT_IN_TOOL_MAP.keys())
|
||||
persona_override_config: PersonaOverrideConfig | None = None
|
||||
llm: LLMOverride = LLMOverride(
|
||||
model_provider=None,
|
||||
model_version="gpt-4o",
|
||||
@@ -96,26 +100,7 @@ class EvalConfigurationOptions(BaseModel):
|
||||
experiment_name: str | None = None
|
||||
|
||||
def get_configuration(self, db_session: Session) -> EvalConfiguration:
|
||||
persona_override_config = self.persona_override_config or PersonaOverrideConfig(
|
||||
name="Eval",
|
||||
description="A persona for evaluation",
|
||||
tools=[
|
||||
ToolConfig(id=get_builtin_tool(db_session, BUILT_IN_TOOL_MAP[tool]).id)
|
||||
for tool in self.builtin_tool_types
|
||||
],
|
||||
prompts=[
|
||||
PromptOverrideConfig(
|
||||
name="Default",
|
||||
description="Default prompt for evaluation",
|
||||
system_prompt="You are a helpful assistant.",
|
||||
task_prompt="",
|
||||
datetime_aware=True,
|
||||
)
|
||||
],
|
||||
)
|
||||
|
||||
return EvalConfiguration(
|
||||
persona_override_config=persona_override_config,
|
||||
llm=self.llm,
|
||||
search_permissions_email=self.search_permissions_email,
|
||||
allowed_tool_ids=[
|
||||
|
||||
@@ -2,7 +2,6 @@ from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
from onyx.auth.schemas import UserRole
|
||||
from onyx.chat.models import PersonaOverrideConfig
|
||||
from onyx.configs.model_configs import GEN_AI_TEMPERATURE
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.enums import LLMModelFlowType
|
||||
@@ -77,7 +76,7 @@ def _build_model_kwargs(
|
||||
|
||||
|
||||
def get_llm_for_persona(
|
||||
persona: Persona | PersonaOverrideConfig | None,
|
||||
persona: Persona | None,
|
||||
user: User,
|
||||
llm_override: LLMOverride | None = None,
|
||||
additional_headers: dict[str, str] | None = None,
|
||||
@@ -102,20 +101,16 @@ def get_llm_for_persona(
|
||||
if not provider_model:
|
||||
raise ValueError("No LLM provider found")
|
||||
|
||||
# Only check access control for database Persona entities, not PersonaOverrideConfig
|
||||
# PersonaOverrideConfig is used for temporary overrides and doesn't have access restrictions
|
||||
persona_model = persona if isinstance(persona, Persona) else None
|
||||
|
||||
# Fetch user group IDs for access control check
|
||||
user_group_ids = fetch_user_group_ids(db_session, user)
|
||||
|
||||
if not can_user_access_llm_provider(
|
||||
provider_model, user_group_ids, persona_model, user.role == UserRole.ADMIN
|
||||
provider_model, user_group_ids, persona, user.role == UserRole.ADMIN
|
||||
):
|
||||
logger.warning(
|
||||
"User %s with persona %s cannot access provider %s. Falling back to default provider.",
|
||||
user.id,
|
||||
getattr(persona_model, "id", None),
|
||||
persona.id,
|
||||
provider_model.name,
|
||||
)
|
||||
return get_default_llm(
|
||||
|
||||
@@ -85,7 +85,7 @@ def send_message(
|
||||
Enforces rate limiting before executing the agent (via dependency).
|
||||
Returns a Server-Sent Events (SSE) stream with the agent's response.
|
||||
|
||||
Follows the same pattern as /chat/send-message for consistency.
|
||||
Follows the same pattern as /chat/send-chat-message for consistency.
|
||||
"""
|
||||
|
||||
def stream_generator() -> Generator[str, None, None]:
|
||||
|
||||
@@ -30,7 +30,6 @@ from onyx.chat.models import ChatFullResponse
|
||||
from onyx.chat.models import CreateChatSessionID
|
||||
from onyx.chat.process_message import gather_stream_full
|
||||
from onyx.chat.process_message import handle_stream_message_objects
|
||||
from onyx.chat.process_message import stream_chat_message_objects
|
||||
from onyx.chat.prompt_utils import get_default_base_system_prompt
|
||||
from onyx.chat.stop_signal_checker import set_fence
|
||||
from onyx.configs.app_configs import WEB_DOMAIN
|
||||
@@ -40,8 +39,6 @@ from onyx.configs.constants import MilestoneRecordType
|
||||
from onyx.configs.constants import PUBLIC_API_TAGS
|
||||
from onyx.configs.model_configs import LITELLM_PASS_THROUGH_HEADERS
|
||||
from onyx.db.chat import add_chats_to_session_from_slack_thread
|
||||
from onyx.db.chat import create_chat_session
|
||||
from onyx.db.chat import create_new_chat_message
|
||||
from onyx.db.chat import delete_all_chat_sessions_for_user
|
||||
from onyx.db.chat import delete_chat_session
|
||||
from onyx.db.chat import duplicate_chat_session_for_user_from_slack
|
||||
@@ -49,7 +46,6 @@ from onyx.db.chat import get_chat_message
|
||||
from onyx.db.chat import get_chat_messages_by_session
|
||||
from onyx.db.chat import get_chat_session_by_id
|
||||
from onyx.db.chat import get_chat_sessions_by_user
|
||||
from onyx.db.chat import get_or_create_root_message
|
||||
from onyx.db.chat import set_as_latest_chat_message
|
||||
from onyx.db.chat import translate_db_message_to_chat_message_detail
|
||||
from onyx.db.chat import update_chat_session
|
||||
@@ -71,7 +67,6 @@ from onyx.llm.constants import LlmProviderNames
|
||||
from onyx.llm.factory import get_default_llm
|
||||
from onyx.llm.factory import get_llm_for_persona
|
||||
from onyx.llm.factory import get_llm_token_counter
|
||||
from onyx.natural_language_processing.utils import get_tokenizer
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.secondary_llm_flows.chat_session_naming import generate_chat_session_name
|
||||
from onyx.server.api_key_usage import check_api_key_usage
|
||||
@@ -86,10 +81,7 @@ from onyx.server.query_and_chat.models import ChatSessionGroup
|
||||
from onyx.server.query_and_chat.models import ChatSessionsResponse
|
||||
from onyx.server.query_and_chat.models import ChatSessionSummary
|
||||
from onyx.server.query_and_chat.models import ChatSessionUpdateRequest
|
||||
from onyx.server.query_and_chat.models import CreateChatMessageRequest
|
||||
from onyx.server.query_and_chat.models import LLMOverride
|
||||
from onyx.server.query_and_chat.models import MessageOrigin
|
||||
from onyx.server.query_and_chat.models import PromptOverride
|
||||
from onyx.server.query_and_chat.models import RenameChatSessionResponse
|
||||
from onyx.server.query_and_chat.models import SendMessageRequest
|
||||
from onyx.server.query_and_chat.models import UpdateChatSessionTemperatureRequest
|
||||
@@ -503,71 +495,8 @@ def delete_chat_session_by_id(
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
|
||||
# WARNING: this endpoint is deprecated and will be removed soon. Use the new send-chat-message endpoint instead.
|
||||
@router.post("/send-message")
|
||||
def handle_new_chat_message(
|
||||
chat_message_req: CreateChatMessageRequest,
|
||||
request: Request,
|
||||
user: User = Depends(current_chat_accessible_user),
|
||||
_rate_limit_check: None = Depends(check_token_rate_limits),
|
||||
_api_key_usage_check: None = Depends(check_api_key_usage),
|
||||
) -> StreamingResponse:
|
||||
"""
|
||||
This endpoint is both used for all the following purposes:
|
||||
- Sending a new message in the session
|
||||
- Regenerating a message in the session (just send the same one again)
|
||||
- Editing a message (similar to regenerating but sending a different message)
|
||||
- Kicking off a seeded chat session (set `use_existing_user_message`)
|
||||
|
||||
Assumes that previous messages have been set as the latest to minimize overhead.
|
||||
|
||||
Args:
|
||||
chat_message_req (CreateChatMessageRequest): Details about the new chat message.
|
||||
request (Request): The current HTTP request context.
|
||||
user (User): The current user, obtained via dependency injection.
|
||||
_ (None): Rate limit check is run if user/group/global rate limits are enabled.
|
||||
|
||||
Returns:
|
||||
StreamingResponse: Streams the response to the new chat message.
|
||||
"""
|
||||
tenant_id = get_current_tenant_id()
|
||||
logger.debug(f"Received new chat message: {chat_message_req.message}")
|
||||
|
||||
if not chat_message_req.message and not chat_message_req.use_existing_user_message:
|
||||
raise HTTPException(status_code=400, detail="Empty chat message is invalid")
|
||||
|
||||
mt_cloud_telemetry(
|
||||
tenant_id=tenant_id,
|
||||
distinct_id=tenant_id if user.is_anonymous else user.email,
|
||||
event=MilestoneRecordType.RAN_QUERY,
|
||||
)
|
||||
|
||||
def stream_generator() -> Generator[str, None, None]:
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
for obj in stream_chat_message_objects(
|
||||
new_msg_req=chat_message_req,
|
||||
user=user,
|
||||
db_session=db_session,
|
||||
litellm_additional_headers=extract_headers(
|
||||
request.headers, LITELLM_PASS_THROUGH_HEADERS
|
||||
),
|
||||
custom_tool_additional_headers=get_custom_tool_additional_request_headers(
|
||||
request.headers
|
||||
),
|
||||
):
|
||||
yield get_json_line(obj.model_dump())
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("Error in chat message streaming")
|
||||
yield json.dumps({"error": str(e)})
|
||||
|
||||
finally:
|
||||
logger.debug("Stream generator finished")
|
||||
|
||||
return StreamingResponse(stream_generator(), media_type="text/event-stream")
|
||||
|
||||
|
||||
# NOTE: This endpoint is extremely central to the application, any changes to it should be reviewed and approved by an experienced
|
||||
# team member. It is very important to 1. avoid bloat and 2. that this remains backwards compatible across versions.
|
||||
@router.post(
|
||||
"/send-chat-message",
|
||||
response_model=ChatFullResponse,
|
||||
@@ -815,77 +744,6 @@ def get_available_context_tokens_for_session(
|
||||
"""Endpoints for chat seeding"""
|
||||
|
||||
|
||||
class ChatSeedRequest(BaseModel):
|
||||
# standard chat session stuff
|
||||
persona_id: int
|
||||
|
||||
# overrides / seeding
|
||||
llm_override: LLMOverride | None = None
|
||||
prompt_override: PromptOverride | None = None
|
||||
description: str | None = None
|
||||
message: str | None = None
|
||||
|
||||
# TODO: support this
|
||||
# initial_message_retrieval_options: RetrievalDetails | None = None
|
||||
|
||||
|
||||
class ChatSeedResponse(BaseModel):
|
||||
redirect_url: str
|
||||
|
||||
|
||||
@router.post("/seed-chat-session", tags=PUBLIC_API_TAGS)
|
||||
def seed_chat(
|
||||
chat_seed_request: ChatSeedRequest,
|
||||
# NOTE: This endpoint is designed for programmatic access (API keys, external services)
|
||||
# rather than authenticated user sessions. The user parameter is used for access control
|
||||
# but the created chat session is "unassigned" (user_id=None) until a user visits the web UI.
|
||||
# This allows external systems to pre-seed chat sessions that users can then access.
|
||||
user: User = Depends(current_chat_accessible_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> ChatSeedResponse:
|
||||
|
||||
try:
|
||||
new_chat_session = create_chat_session(
|
||||
db_session=db_session,
|
||||
description=chat_seed_request.description or "",
|
||||
user_id=None, # this chat session is "unassigned" until a user visits the web UI
|
||||
persona_id=chat_seed_request.persona_id,
|
||||
llm_override=chat_seed_request.llm_override,
|
||||
prompt_override=chat_seed_request.prompt_override,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
raise HTTPException(status_code=400, detail="Invalid Persona provided.")
|
||||
|
||||
if chat_seed_request.message is not None:
|
||||
root_message = get_or_create_root_message(
|
||||
chat_session_id=new_chat_session.id, db_session=db_session
|
||||
)
|
||||
llm = get_llm_for_persona(
|
||||
persona=new_chat_session.persona,
|
||||
user=user,
|
||||
)
|
||||
|
||||
tokenizer = get_tokenizer(
|
||||
model_name=llm.config.model_name,
|
||||
provider_type=llm.config.model_provider,
|
||||
)
|
||||
token_count = len(tokenizer.encode(chat_seed_request.message))
|
||||
|
||||
create_new_chat_message(
|
||||
chat_session_id=new_chat_session.id,
|
||||
parent_message=root_message,
|
||||
message=chat_seed_request.message,
|
||||
token_count=token_count,
|
||||
message_type=MessageType.USER,
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
return ChatSeedResponse(
|
||||
redirect_url=f"{WEB_DOMAIN}/chat?chatId={new_chat_session.id}&seeded=true"
|
||||
)
|
||||
|
||||
|
||||
class SeedChatFromSlackRequest(BaseModel):
|
||||
chat_session_id: UUID
|
||||
|
||||
|
||||
@@ -1,18 +1,15 @@
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from typing import TYPE_CHECKING
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import model_validator
|
||||
|
||||
from onyx.chat.models import PersonaOverrideConfig
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.configs.constants import SessionType
|
||||
from onyx.context.search.models import BaseFilters
|
||||
from onyx.context.search.models import ChunkContext
|
||||
from onyx.context.search.models import SavedSearchDoc
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.context.search.models import Tag
|
||||
@@ -20,7 +17,6 @@ from onyx.db.enums import ChatSessionSharedStatus
|
||||
from onyx.db.models import ChatSession
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
from onyx.llm.override_models import LLMOverride
|
||||
from onyx.llm.override_models import PromptOverride
|
||||
from onyx.server.query_and_chat.streaming_models import Packet
|
||||
|
||||
|
||||
@@ -40,8 +36,9 @@ class MessageOrigin(str, Enum):
|
||||
UNSET = "unset"
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
class MessageResponseIDInfo(BaseModel):
|
||||
user_message_id: int | None
|
||||
reserved_assistant_message_id: int
|
||||
|
||||
|
||||
class SourceTag(Tag):
|
||||
@@ -83,6 +80,8 @@ class ChatFeedbackRequest(BaseModel):
|
||||
return self
|
||||
|
||||
|
||||
# NOTE: This model is used for the core flow of the Onyx application, any changes to it should be reviewed and approved by an
|
||||
# experienced team member. It is very important to 1. avoid bloat and 2. that this remains backwards compatible across versions.
|
||||
class SendMessageRequest(BaseModel):
|
||||
message: str
|
||||
|
||||
@@ -141,115 +140,6 @@ class SendMessageRequest(BaseModel):
|
||||
return self
|
||||
|
||||
|
||||
class OptionalSearchSetting(str, Enum):
|
||||
ALWAYS = "always"
|
||||
NEVER = "never"
|
||||
# Determine whether to run search based on history and latest query
|
||||
AUTO = "auto"
|
||||
|
||||
|
||||
class RetrievalDetails(ChunkContext):
|
||||
# Use LLM to determine whether to do a retrieval or only rely on existing history
|
||||
# If the Persona is configured to not run search (0 chunks), this is bypassed
|
||||
# If no Prompt is configured, the only search results are shown, this is bypassed
|
||||
run_search: OptionalSearchSetting = OptionalSearchSetting.AUTO
|
||||
# Is this a real-time/streaming call or a question where Onyx can take more time?
|
||||
# Used to determine reranking flow
|
||||
real_time: bool = True
|
||||
# The following have defaults in the Persona settings which can be overridden via
|
||||
# the query, if None, then use Persona settings
|
||||
filters: BaseFilters | None = None
|
||||
enable_auto_detect_filters: bool | None = None
|
||||
# if None, no offset / limit
|
||||
offset: int | None = None
|
||||
limit: int | None = None
|
||||
|
||||
# If this is set, only the highest matching chunk (or merged chunks) is returned
|
||||
dedupe_docs: bool = False
|
||||
|
||||
|
||||
class CreateChatMessageRequest(ChunkContext):
|
||||
"""Before creating messages, be sure to create a chat_session and get an id"""
|
||||
|
||||
chat_session_id: UUID
|
||||
# This is the primary-key (unique identifier) for the previous message of the tree
|
||||
parent_message_id: int | None
|
||||
|
||||
# New message contents
|
||||
message: str
|
||||
# Files that we should attach to this message
|
||||
file_descriptors: list[FileDescriptor] = []
|
||||
# Prompts are embedded in personas, so no separate prompt_id needed
|
||||
# If search_doc_ids provided, it should use those docs explicitly
|
||||
search_doc_ids: list[int] | None
|
||||
retrieval_options: RetrievalDetails | None
|
||||
# allows the caller to specify the exact search query they want to use
|
||||
# will disable Query Rewording if specified
|
||||
query_override: str | None = None
|
||||
|
||||
# enables additional handling to ensure that we regenerate with a given user message ID
|
||||
regenerate: bool | None = None
|
||||
|
||||
# allows the caller to override the Persona / Prompt
|
||||
# these do not persist in the chat thread details
|
||||
llm_override: LLMOverride | None = None
|
||||
# Test-only override for deterministic LiteLLM mock responses.
|
||||
mock_llm_response: str | None = None
|
||||
prompt_override: PromptOverride | None = None
|
||||
|
||||
# Allows the caller to override the temperature for the chat session
|
||||
# this does persist in the chat thread details
|
||||
temperature_override: float | None = None
|
||||
|
||||
# allow user to specify an alternate assistant
|
||||
alternate_assistant_id: int | None = None
|
||||
|
||||
# This takes the priority over the prompt_override
|
||||
# This won't be a type that's passed in directly from the API
|
||||
persona_override_config: PersonaOverrideConfig | None = None
|
||||
|
||||
# used for seeded chats to kick off the generation of an AI answer
|
||||
use_existing_user_message: bool = False
|
||||
|
||||
# used for "OpenAI Assistants API"
|
||||
existing_assistant_message_id: int | None = None
|
||||
|
||||
# forces the LLM to return a structured response, see
|
||||
# https://platform.openai.com/docs/guides/structured-outputs/introduction
|
||||
structured_response_format: dict | None = None
|
||||
|
||||
skip_gen_ai_answer_generation: bool = False
|
||||
|
||||
# List of allowed tool IDs to restrict tool usage. If not provided, all tools available to the persona will be used.
|
||||
allowed_tool_ids: list[int] | None = None
|
||||
|
||||
# List of tool IDs we MUST use.
|
||||
# TODO: make this a single one since unclear how to force this for multiple at a time.
|
||||
forced_tool_ids: list[int] | None = None
|
||||
|
||||
deep_research: bool = False
|
||||
|
||||
# When True (default), enables citation generation with markers and CitationInfo packets
|
||||
# When False, disables citations: removes markers like [1], [2] and skips CitationInfo packets
|
||||
include_citations: bool = True
|
||||
|
||||
# Origin of the message for telemetry tracking
|
||||
origin: MessageOrigin = MessageOrigin.UNKNOWN
|
||||
|
||||
@model_validator(mode="after")
|
||||
def check_search_doc_ids_or_retrieval_options(self) -> "CreateChatMessageRequest":
|
||||
if self.search_doc_ids is None and self.retrieval_options is None:
|
||||
raise ValueError(
|
||||
"Either search_doc_ids or retrieval_options must be provided, but not both or neither."
|
||||
)
|
||||
return self
|
||||
|
||||
def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
data = super().model_dump(*args, **kwargs)
|
||||
data["chat_session_id"] = str(data["chat_session_id"])
|
||||
return data
|
||||
|
||||
|
||||
class ChatMessageIdentifier(BaseModel):
|
||||
message_id: int
|
||||
|
||||
@@ -365,13 +255,3 @@ class ChatSearchResponse(BaseModel):
|
||||
groups: list[ChatSessionGroup]
|
||||
has_more: bool
|
||||
next_page: int | None = None
|
||||
|
||||
|
||||
class ChatSearchRequest(BaseModel):
|
||||
query: str | None = None
|
||||
page: int = 1
|
||||
page_size: int = 10
|
||||
|
||||
|
||||
class CreateChatResponse(BaseModel):
|
||||
chat_session_id: str
|
||||
|
||||
@@ -17,11 +17,12 @@ disallow_untyped_defs = true
|
||||
warn_unused_ignores = true
|
||||
enable_error_code = ["possibly-undefined"]
|
||||
strict_equality = true
|
||||
# Patterns match paths whether mypy is run from backend/ (CI) or repo root (e.g. VS Code extension with target ./backend)
|
||||
exclude = [
|
||||
"^generated/.*",
|
||||
"^\\.venv/",
|
||||
"^onyx/server/features/build/sandbox/kubernetes/docker/skills/pptx/",
|
||||
"^onyx/server/features/build/sandbox/kubernetes/docker/templates/venv/",
|
||||
"(?:^|/)generated/",
|
||||
"(?:^|/)\\.venv/",
|
||||
"(?:^|/)onyx/server/features/build/sandbox/kubernetes/docker/skills/",
|
||||
"(?:^|/)onyx/server/features/build/sandbox/kubernetes/docker/templates/",
|
||||
]
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
|
||||
@@ -23,7 +23,7 @@ def create_new_chat_session(onyx_url: str, api_key: str | None) -> int:
|
||||
|
||||
|
||||
def process_question(onyx_url: str, question: str, api_key: str | None) -> None:
|
||||
message_endpoint = onyx_url + "/api/chat/send-message"
|
||||
message_endpoint = onyx_url + "/api/chat/send-chat-message"
|
||||
|
||||
chat_session_id = create_new_chat_session(onyx_url, api_key)
|
||||
|
||||
|
||||
@@ -88,7 +88,7 @@ class ChatLoadTester:
|
||||
token_count = 0
|
||||
|
||||
async with session.post(
|
||||
f"{self.base_url}/chat/send-message",
|
||||
f"{self.base_url}/chat/send-chat-message",
|
||||
headers=self.headers,
|
||||
json={
|
||||
"chat_session_id": chat_session_id,
|
||||
|
||||
@@ -4,8 +4,8 @@ from typing import cast
|
||||
|
||||
from onyx.chat.models import AnswerStreamPart
|
||||
from onyx.chat.models import CreateChatSessionID
|
||||
from onyx.chat.models import MessageResponseIDInfo
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.server.query_and_chat.models import MessageResponseIDInfo
|
||||
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
|
||||
from onyx.server.query_and_chat.streaming_models import ImageGenerationFinal
|
||||
from onyx.server.query_and_chat.streaming_models import OpenUrlDocuments
|
||||
|
||||
@@ -6,9 +6,8 @@ from uuid import uuid4
from sqlalchemy.orm import Session

from onyx.chat.models import AnswerStreamPart
from onyx.chat.models import MessageResponseIDInfo
from onyx.chat.models import StreamingError
from onyx.chat.process_message import stream_chat_message_objects
from onyx.chat.process_message import handle_stream_message_objects
from onyx.db.chat import create_chat_session
from onyx.db.enums import LLMModelFlowType
from onyx.db.llm import fetch_existing_llm_providers
@@ -18,8 +17,8 @@ from onyx.db.llm import upsert_llm_provider
from onyx.llm.constants import LlmProviderNames
from onyx.server.manage.llm.models import LLMProviderUpsertRequest
from onyx.server.manage.llm.models import ModelConfigurationUpsertRequest
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.server.query_and_chat.models import RetrievalDetails
from onyx.server.query_and_chat.models import MessageResponseIDInfo
from onyx.server.query_and_chat.models import SendMessageRequest
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
from onyx.server.query_and_chat.streaming_models import Packet
@@ -70,17 +69,13 @@ def test_answer_with_only_anthropic_provider(
persona_id=0,
)

chat_request = CreateChatMessageRequest(
chat_session_id=chat_session.id,
parent_message_id=None,
chat_request = SendMessageRequest(
message="hello",
file_descriptors=[],
search_doc_ids=None,
retrieval_options=RetrievalDetails(),
chat_session_id=chat_session.id,
)

response_stream: list[AnswerStreamPart] = []
for packet in stream_chat_message_objects(
for packet in handle_stream_message_objects(
new_msg_req=chat_request,
user=test_user,
db_session=db_session,

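Because the rendering above interleaves removed and added lines without markers, here is the updated test flow consolidated from the added lines only (a reconstruction, not part of the diff; fixture names are the test's own):

chat_request = SendMessageRequest(
    message="hello",
    file_descriptors=[],
    search_doc_ids=None,
    retrieval_options=RetrievalDetails(),
    chat_session_id=chat_session.id,
)
response_stream: list[AnswerStreamPart] = []
for packet in handle_stream_message_objects(
    new_msg_req=chat_request,
    user=test_user,
    db_session=db_session,
):
    response_stream.append(packet)  # note: parent_message_id is no longer passed at all
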
@@ -4,14 +4,13 @@ from datetime import datetime
from sqlalchemy.orm import Session

from onyx.chat.models import AnswerStreamPart
from onyx.chat.models import MessageResponseIDInfo
from onyx.chat.models import StreamingError
from onyx.chat.process_message import stream_chat_message_objects
from onyx.chat.process_message import handle_stream_message_objects
from onyx.db.chat import create_chat_session
from onyx.db.models import User
from onyx.db.persona import get_persona_by_id
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.server.query_and_chat.models import RetrievalDetails
from onyx.server.query_and_chat.models import MessageResponseIDInfo
from onyx.server.query_and_chat.models import SendMessageRequest
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
from tests.external_dependency_unit.answer.conftest import ensure_default_llm_provider
from tests.external_dependency_unit.conftest import create_test_user
@@ -42,18 +41,12 @@ def test_stream_chat_current_date_response(
persona_id=default_persona.id,
)

chat_request = CreateChatMessageRequest(
chat_session_id=chat_session.id,
parent_message_id=None,
chat_request = SendMessageRequest(
message="Please respond only with the current date in the format 'Weekday Month DD, YYYY'.",
file_descriptors=[],
prompt_override=None,
search_doc_ids=None,
retrieval_options=RetrievalDetails(),
query_override=None,
chat_session_id=chat_session.id,
)

gen = stream_chat_message_objects(
gen = handle_stream_message_objects(
new_msg_req=chat_request,
user=test_user,
db_session=db_session,

@@ -7,8 +7,8 @@ import pytest
from sqlalchemy.orm import Session

from onyx.chat.models import CreateChatSessionID
from onyx.chat.models import MessageResponseIDInfo
from onyx.configs.constants import DocumentSource
from onyx.server.query_and_chat.models import MessageResponseIDInfo
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
from onyx.server.query_and_chat.streaming_models import GeneratedImage
from onyx.server.query_and_chat.streaming_models import ImageGenerationFinal

@@ -6,15 +6,14 @@ import pytest
from sqlalchemy.orm import Session

from onyx.chat.models import AnswerStreamPart
from onyx.chat.models import MessageResponseIDInfo
from onyx.chat.models import StreamingError
from onyx.chat.process_message import stream_chat_message_objects
from onyx.chat.process_message import handle_stream_message_objects
from onyx.db.chat import create_chat_session
from onyx.db.models import RecencyBiasSetting
from onyx.db.models import User
from onyx.db.persona import upsert_persona
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.server.query_and_chat.models import RetrievalDetails
from onyx.server.query_and_chat.models import MessageResponseIDInfo
from onyx.server.query_and_chat.models import SendMessageRequest
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
from onyx.server.query_and_chat.streaming_models import Packet
from tests.external_dependency_unit.answer.conftest import ensure_default_llm_provider
@@ -100,18 +99,12 @@ def test_stream_chat_message_objects_without_web_search(
persona_id=test_persona.id,
)
# Create the chat message request with a query that attempts to force web search
chat_request = CreateChatMessageRequest(
chat_session_id=chat_session.id,
parent_message_id=None,
chat_request = SendMessageRequest(
message="run a web search for 'Onyx'",
file_descriptors=[],
prompt_override=None,
search_doc_ids=None,
retrieval_options=RetrievalDetails(),
query_override=None,
chat_session_id=chat_session.id,
)
# Call stream_chat_message_objects
response_generator = stream_chat_message_objects(
# Call handle_stream_message_objects
response_generator = handle_stream_message_objects(
new_msg_req=chat_request,
user=test_user,
db_session=db_session,

@@ -8,7 +8,6 @@ import pytest
from fastapi_users.password import PasswordHelper
from sqlalchemy.orm import Session

from onyx.chat.models import MessageResponseIDInfo
from onyx.db.llm import fetch_existing_llm_provider
from onyx.db.llm import remove_llm_provider
from onyx.db.llm import update_default_provider
@@ -21,6 +20,7 @@ from onyx.server.manage.llm.models import LLMProviderUpsertRequest
from onyx.server.manage.llm.models import ModelConfigurationUpsertRequest
from onyx.server.query_and_chat.chat_backend import create_new_chat_session
from onyx.server.query_and_chat.models import ChatSessionCreationRequest
from onyx.server.query_and_chat.models import MessageResponseIDInfo
from tests.external_dependency_unit.answer.stream_test_assertions import (
assert_answer_stream_part_correct,
)

@@ -29,7 +29,7 @@ def test_create_chat_session_and_send_messages() -> None:
# Send first message
first_message = "Hello, this is a test message."
send_message_response = requests.post(
f"{base_url}/chat/send-message",
f"{base_url}/chat/send-chat-message",
json={
"chat_session_id": chat_session_id,
"message": first_message,
@@ -43,7 +43,7 @@ def test_create_chat_session_and_send_messages() -> None:
# Send second message
second_message = "Can you provide more information?"
send_message_response = requests.post(
f"{base_url}/chat/send-message",
f"{base_url}/chat/send-chat-message",
json={
"chat_session_id": chat_session_id,
"message": second_message,

@@ -12,10 +12,9 @@ from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.models import SearchDoc
from onyx.file_store.models import FileDescriptor
from onyx.llm.override_models import LLMOverride
from onyx.llm.override_models import PromptOverride
from onyx.server.query_and_chat.models import AUTO_PLACE_AFTER_LATEST_MESSAGE
from onyx.server.query_and_chat.models import ChatSessionCreationRequest
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.server.query_and_chat.models import RetrievalDetails
from onyx.server.query_and_chat.models import SendMessageRequest
from onyx.server.query_and_chat.streaming_models import StreamingType
from tests.integration.common_utils.constants import API_SERVER_URL
from tests.integration.common_utils.constants import GENERAL_HEADERS
@@ -104,37 +103,27 @@ class ChatSessionManager:
parent_message_id: int | None = None,
user_performing_action: DATestUser | None = None,
file_descriptors: list[FileDescriptor] | None = None,
search_doc_ids: list[int] | None = None,
retrieval_options: RetrievalDetails | None = None,
query_override: str | None = None,
regenerate: bool | None = None,
llm_override: LLMOverride | None = None,
prompt_override: PromptOverride | None = None,
alternate_assistant_id: int | None = None,
use_existing_user_message: bool = False,
allowed_tool_ids: list[int] | None = None,
forced_tool_ids: list[int] | None = None,
chat_session: DATestChatSession | None = None,
mock_llm_response: str | None = None,
deep_research: bool = False,
llm_override: LLMOverride | None = None,
) -> StreamedResponse:
chat_message_req = CreateChatMessageRequest(
chat_session_id=chat_session_id,
parent_message_id=parent_message_id,
chat_message_req = SendMessageRequest(
message=message,
chat_session_id=chat_session_id,
parent_message_id=(
parent_message_id
if parent_message_id is not None
else AUTO_PLACE_AFTER_LATEST_MESSAGE
),
file_descriptors=file_descriptors or [],
search_doc_ids=search_doc_ids or [],
retrieval_options=retrieval_options,
query_override=query_override,
regenerate=regenerate,
llm_override=llm_override,
mock_llm_response=mock_llm_response,
prompt_override=prompt_override,
alternate_assistant_id=alternate_assistant_id,
use_existing_user_message=use_existing_user_message,
allowed_tool_ids=allowed_tool_ids,
forced_tool_ids=forced_tool_ids,
forced_tool_id=forced_tool_ids[0] if forced_tool_ids else None,
mock_llm_response=mock_llm_response,
deep_research=deep_research,
llm_override=llm_override,
)

headers = (
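
The test helper keeps its old keyword surface (optional parent_message_id, forced_tool_ids as a list) and maps it onto the new request fields: a missing parent falls back to the AUTO_PLACE_AFTER_LATEST_MESSAGE sentinel, and only the first forced tool id is forwarded as forced_tool_id. An illustrative caller, assuming this method is ChatSessionManager.send_message (the fixture names are placeholders, not taken from the diff):

response = ChatSessionManager.send_message(
    chat_session_id=chat_session.id,      # placeholder fixture
    message="hello",
    user_performing_action=admin_user,    # placeholder fixture
    forced_tool_ids=[search_tool_id],     # still a list here; sent on as forced_tool_id=search_tool_id
)
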
@@ -145,8 +134,8 @@ class ChatSessionManager:
cookies = user_performing_action.cookies if user_performing_action else None

response = requests.post(
f"{API_SERVER_URL}/chat/send-message",
json=chat_message_req.model_dump(),
f"{API_SERVER_URL}/chat/send-chat-message",
json=chat_message_req.model_dump(mode="json"),
headers=headers,
stream=True,
cookies=cookies,
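
The switch from model_dump() to model_dump(mode="json") matters because requests serializes the json= payload with the standard json module, which cannot handle raw UUID or datetime values; mode="json" has Pydantic coerce those fields to plain strings first. A standalone sketch of the difference (assuming Pydantic v2 and a UUID-typed chat_session_id, as used elsewhere in this codebase):

import uuid
from pydantic import BaseModel

class Req(BaseModel):  # stand-in for SendMessageRequest, illustrative only
    chat_session_id: uuid.UUID
    message: str

req = Req(chat_session_id=uuid.uuid4(), message="hi")
req.model_dump()             # {'chat_session_id': UUID('...'), ...} - json.dumps would raise TypeError
req.model_dump(mode="json")  # {'chat_session_id': '...', ...} - safe to pass as json=
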
@@ -182,17 +171,11 @@ class ChatSessionManager:
parent_message_id: int | None = None,
user_performing_action: DATestUser | None = None,
file_descriptors: list[FileDescriptor] | None = None,
search_doc_ids: list[int] | None = None,
query_override: str | None = None,
regenerate: bool | None = None,
llm_override: LLMOverride | None = None,
prompt_override: PromptOverride | None = None,
alternate_assistant_id: int | None = None,
use_existing_user_message: bool = False,
allowed_tool_ids: list[int] | None = None,
forced_tool_ids: list[int] | None = None,
mock_llm_response: str | None = None,
deep_research: bool = False,
llm_override: LLMOverride | None = None,
) -> None:
"""
Send a message and simulate client disconnect before stream completes.
@@ -204,33 +187,25 @@ class ChatSessionManager:
chat_session_id: The chat session ID
message: The message to send
disconnect_after_packets: Disconnect after receiving this many packets.
If None, disconnect_after_type must be specified.
disconnect_after_type: Disconnect after receiving a packet of this type
(e.g., "message_start", "search_tool_start"). If None,
disconnect_after_packets must be specified.
... (other standard message parameters)

Returns:
StreamedResponse containing data received before disconnect,
with is_disconnected=True flag set.
None. Caller can verify server-side cleanup via get_chat_history etc.
"""
chat_message_req = CreateChatMessageRequest(
chat_session_id=chat_session_id,
parent_message_id=parent_message_id,
chat_message_req = SendMessageRequest(
message=message,
chat_session_id=chat_session_id,
parent_message_id=(
parent_message_id
if parent_message_id is not None
else AUTO_PLACE_AFTER_LATEST_MESSAGE
),
file_descriptors=file_descriptors or [],
search_doc_ids=search_doc_ids or [],
retrieval_options=RetrievalDetails(), # This will be deprecated soon anyway
query_override=query_override,
regenerate=regenerate,
llm_override=llm_override,
mock_llm_response=mock_llm_response,
prompt_override=prompt_override,
alternate_assistant_id=alternate_assistant_id,
use_existing_user_message=use_existing_user_message,
allowed_tool_ids=allowed_tool_ids,
forced_tool_ids=forced_tool_ids,
forced_tool_id=forced_tool_ids[0] if forced_tool_ids else None,
mock_llm_response=mock_llm_response,
deep_research=deep_research,
llm_override=llm_override,
)

headers = (
@@ -243,8 +218,8 @@ class ChatSessionManager:
packets_received = 0

with requests.post(
f"{API_SERVER_URL}/chat/send-message",
json=chat_message_req.model_dump(),
f"{API_SERVER_URL}/chat/send-chat-message",
json=chat_message_req.model_dump(mode="json"),
headers=headers,
stream=True,
cookies=cookies,

@@ -1,7 +1,5 @@
from onyx.configs import app_configs
from onyx.configs.constants import DocumentSource
from onyx.server.query_and_chat.models import OptionalSearchSetting
from onyx.server.query_and_chat.models import RetrievalDetails
from onyx.tools.constants import SEARCH_TOOL_ID
from tests.integration.common_utils.managers.cc_pair import CCPairManager
from tests.integration.common_utils.managers.chat import ChatSessionManager
@@ -172,7 +170,7 @@ def test_run_search_always_maps_to_forced_search_tool(admin_user: DATestUser) ->
chat_session_id=chat_session.id,
message="always run search",
user_performing_action=admin_user,
retrieval_options=RetrievalDetails(run_search=OptionalSearchSetting.ALWAYS),
forced_tool_ids=[search_tool_id],
mock_llm_response='{"name":"internal_search","arguments":{"queries":["gamma"]}}',
)

@@ -196,7 +196,7 @@ members = ["backend", "tools/ods"]

[tool.basedpyright]
include = ["backend"]
exclude = ["backend/generated"]
exclude = ["backend/generated", "backend/onyx/server/features/build/sandbox/kubernetes/docker/skills/pptx", "backend/onyx/server/features/build/sandbox/kubernetes/docker/templates/venv"]
typeCheckingMode = "off"

[tool.ruff]