StreamWriter w/o default values

code for timing langgraph basic search
push various minor updates
2026-03-01 13:45:44 +00:00 · 2025-02-04 16:18:01 -08:00 · 2025-02-04 13:01:06 -08:00 · 2025-02-03 21:23:45 -08:00 · 2025-02-03 20:49:45 -08:00 · 2025-02-03 20:10:51 -08:00
151 changed files with 12312 additions and 1034 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,6 @@
 .vscode/
 *.sw?
 /backend/tests/regression/answer_quality/search_test_config.yaml
-/web/test-results/
+/web/test-results/
+backend/onyx/agent_search/main/test_data.json
+backend/tests/regression/answer_quality/test_data.json
--- a/.vscode/env_template.txt
+++ b/.vscode/env_template.txt
@@ -52,3 +52,9 @@ BING_API_KEY=<REPLACE THIS>
 # Enable the full set of Danswer Enterprise Edition features
 # NOTE: DO NOT ENABLE THIS UNLESS YOU HAVE A PAID ENTERPRISE LICENSE (or if you are using this for local testing/development)
 ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=False
+
+# Agent Search configs  # TODO: Rename these settings / give them proper names
+AGENT_RETRIEVAL_STATS=False   # Note: This setting will incur substantial re-ranking effort
+AGENT_RERANKING_STATS=True
+AGENT_MAX_QUERY_RETRIEVAL_RESULTS=20
+AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS=20
--- a/backend/alembic/versions/98a5008d8711_agent_tracking.py
+++ b/backend/alembic/versions/98a5008d8711_agent_tracking.py
@@ -0,0 +1,107 @@
+"""agent_tracking
+
+Revision ID: 98a5008d8711
+Revises: 2f80c6a2550f
+Create Date: 2025-01-29 17:00:00.000001
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy.dialects.postgresql import UUID
+
+# revision identifiers, used by Alembic.
+revision = "98a5008d8711"
+down_revision = "2f80c6a2550f"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "agent__search_metrics",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column("persona_id", sa.Integer(), nullable=True),
+        sa.Column("agent_type", sa.String(), nullable=False),
+        sa.Column("start_time", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("base_duration_s", sa.Float(), nullable=False),
+        sa.Column("full_duration_s", sa.Float(), nullable=False),
+        sa.Column("base_metrics", postgresql.JSONB(), nullable=True),
+        sa.Column("refined_metrics", postgresql.JSONB(), nullable=True),
+        sa.Column("all_metrics", postgresql.JSONB(), nullable=True),
+        sa.ForeignKeyConstraint(
+            ["persona_id"],
+            ["persona.id"],
+        ),
+        sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+    # Create sub_question table
+    op.create_table(
+        "agent__sub_question",
+        sa.Column("id", sa.Integer, primary_key=True),
+        sa.Column("primary_question_id", sa.Integer, sa.ForeignKey("chat_message.id")),
+        sa.Column(
+            "chat_session_id", UUID(as_uuid=True), sa.ForeignKey("chat_session.id")
+        ),
+        sa.Column("sub_question", sa.Text),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.func.now()
+        ),
+        sa.Column("sub_answer", sa.Text),
+        sa.Column("sub_question_doc_results", postgresql.JSONB(), nullable=True),
+        sa.Column("level", sa.Integer(), nullable=False),
+        sa.Column("level_question_num", sa.Integer(), nullable=False),
+    )
+
+    # Create sub_query table
+    op.create_table(
+        "agent__sub_query",
+        sa.Column("id", sa.Integer, primary_key=True),
+        sa.Column(
+            "parent_question_id", sa.Integer, sa.ForeignKey("agent__sub_question.id")
+        ),
+        sa.Column(
+            "chat_session_id", UUID(as_uuid=True), sa.ForeignKey("chat_session.id")
+        ),
+        sa.Column("sub_query", sa.Text),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.func.now()
+        ),
+    )
+
+    # Create sub_query__search_doc association table
+    op.create_table(
+        "agent__sub_query__search_doc",
+        sa.Column(
+            "sub_query_id",
+            sa.Integer,
+            sa.ForeignKey("agent__sub_query.id"),
+            primary_key=True,
+        ),
+        sa.Column(
+            "search_doc_id",
+            sa.Integer,
+            sa.ForeignKey("search_doc.id"),
+            primary_key=True,
+        ),
+    )
+
+    op.add_column(
+        "chat_message",
+        sa.Column(
+            "refined_answer_improvement",
+            sa.Boolean(),
+            nullable=True,
+        ),
+    )
+
+
+def downgrade() -> None:  # reverts upgrade() in reverse creation order
+    op.drop_column("chat_message", "refined_answer_improvement")
+    op.drop_table("agent__sub_query__search_doc")  # references agent__sub_query
+    op.drop_table("agent__sub_query")  # references agent__sub_question
+    op.drop_table("agent__sub_question")
+    op.drop_table("agent__search_metrics")
--- a/backend/ee/onyx/server/query_and_chat/chat_backend.py
+++ b/backend/ee/onyx/server/query_and_chat/chat_backend.py
@@ -179,6 +179,7 @@ def handle_simplified_chat_message(
        chunks_below=0,
        full_doc=chat_message_req.full_doc,
        structured_response_format=chat_message_req.structured_response_format,
+        use_agentic_search=chat_message_req.use_agentic_search,
    )

    packets = stream_chat_message_objects(
@@ -301,6 +302,7 @@ def handle_send_message_simple_with_history(
        chunks_below=0,
        full_doc=req.full_doc,
        structured_response_format=req.structured_response_format,
+        use_agentic_search=req.use_agentic_search,
    )

    packets = stream_chat_message_objects(
--- a/backend/ee/onyx/server/query_and_chat/models.py
+++ b/backend/ee/onyx/server/query_and_chat/models.py
@@ -57,6 +57,9 @@ class BasicCreateChatMessageRequest(ChunkContext):
    # https://platform.openai.com/docs/guides/structured-outputs/introduction
    structured_response_format: dict | None = None

+    # If True, uses agentic search instead of basic search
+    use_agentic_search: bool = False
+

 class BasicCreateChatMessageWithHistoryRequest(ChunkContext):
    # Last element is the new query. All previous elements are historical context
@@ -71,6 +74,8 @@ class BasicCreateChatMessageWithHistoryRequest(ChunkContext):
    # only works if using an OpenAI model. See the following for more details:
    # https://platform.openai.com/docs/guides/structured-outputs/introduction
    structured_response_format: dict | None = None
+    # If True, uses agentic search instead of basic search
+    use_agentic_search: bool = False


 class SimpleDoc(BaseModel):
@@ -120,9 +125,12 @@ class OneShotQARequest(ChunkContext):
    # will also disable Thread-based Rewording if specified
    query_override: str | None = None

-    # If True, skips generative an AI response to the search query
+    # If True, skips generating an AI response to the search query
    skip_gen_ai_answer_generation: bool = False

+    # If True, uses agentic search instead of basic search
+    use_agentic_search: bool = False
+
    @model_validator(mode="after")
    def check_persona_fields(self) -> "OneShotQARequest":
        if self.persona_override_config is None and self.persona_id is None:
--- a/backend/ee/onyx/server/query_and_chat/query_backend.py
+++ b/backend/ee/onyx/server/query_and_chat/query_backend.py
@@ -196,6 +196,8 @@ def get_answer_stream(
        retrieval_details=query_request.retrieval_options,
        rerank_settings=query_request.rerank_settings,
        db_session=db_session,
+        use_agentic_search=query_request.use_agentic_search,
+        skip_gen_ai_answer_generation=query_request.skip_gen_ai_answer_generation,
    )

    packets = stream_chat_message_objects(
--- a/backend/onyx/agents/agent_search/basic/graph_builder.py
+++ b/backend/onyx/agents/agent_search/basic/graph_builder.py
@@ -0,0 +1,97 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from onyx.agents.agent_search.basic.states import BasicInput
+from onyx.agents.agent_search.basic.states import BasicOutput
+from onyx.agents.agent_search.basic.states import BasicState
+from onyx.agents.agent_search.orchestration.nodes.basic_use_tool_response import (
+    basic_use_tool_response,
+)
+from onyx.agents.agent_search.orchestration.nodes.llm_tool_choice import llm_tool_choice
+from onyx.agents.agent_search.orchestration.nodes.prepare_tool_input import (
+    prepare_tool_input,
+)
+from onyx.agents.agent_search.orchestration.nodes.tool_call import tool_call
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def basic_graph_builder() -> StateGraph:
+    """Build the (uncompiled) basic graph: tool choice, then an optional tool call."""
+    graph = StateGraph(
+        state_schema=BasicState,
+        input=BasicInput,
+        output=BasicOutput,
+    )
+
+    ### Add nodes ###
+
+    graph.add_node(
+        node="prepare_tool_input",
+        action=prepare_tool_input,
+    )
+
+    graph.add_node(
+        node="llm_tool_choice",
+        action=llm_tool_choice,
+    )
+
+    graph.add_node(
+        node="tool_call",
+        action=tool_call,
+    )
+
+    graph.add_node(
+        node="basic_use_tool_response",
+        action=basic_use_tool_response,
+    )
+
+    ### Add edges ###
+
+    graph.add_edge(start_key=START, end_key="prepare_tool_input")
+    graph.add_edge(start_key="prepare_tool_input", end_key="llm_tool_choice")
+    # should_continue routes to "tool_call", or to END when no tool was chosen.
+    graph.add_conditional_edges("llm_tool_choice", should_continue, ["tool_call", END])
+
+    graph.add_edge(
+        start_key="tool_call",
+        end_key="basic_use_tool_response",
+    )
+
+    graph.add_edge(
+        start_key="basic_use_tool_response",
+        end_key=END,
+    )
+
+    return graph
+
+
+def should_continue(state: BasicState) -> str:
+    """Route after llm_tool_choice: run the chosen tool, or finish."""
+    # If there are no tool calls, basic graph already streamed the answer
+    if state.tool_choice is None:
+        return END
+
+    return "tool_call"
+
+
+if __name__ == "__main__":
+    from onyx.db.engine import get_session_context_manager
+    from onyx.context.search.models import SearchRequest
+    from onyx.llm.factory import get_default_llms
+    from onyx.agents.agent_search.shared_graph_utils.utils import get_test_config
+
+    # Manual smoke test: build, compile and invoke the basic graph once.
+    compiled_graph = basic_graph_builder().compile()
+    graph_input = BasicInput(_unused=True)  # renamed: `input` shadowed the builtin
+    primary_llm, fast_llm = get_default_llms()
+    with get_session_context_manager() as db_session:
+        config, _ = get_test_config(
+            db_session=db_session,
+            primary_llm=primary_llm,
+            fast_llm=fast_llm,
+            search_request=SearchRequest(query="How does onyx use FastAPI?"),
+        )
+        compiled_graph.invoke(graph_input, config={"metadata": {"config": config}})
--- a/backend/onyx/agents/agent_search/basic/states.py
+++ b/backend/onyx/agents/agent_search/basic/states.py
@@ -0,0 +1,35 @@
+from typing import TypedDict
+
+from langchain_core.messages import AIMessageChunk
+from pydantic import BaseModel
+
+from onyx.agents.agent_search.orchestration.states import ToolCallUpdate
+from onyx.agents.agent_search.orchestration.states import ToolChoiceInput
+from onyx.agents.agent_search.orchestration.states import ToolChoiceUpdate
+
+# States contain values that change over the course of graph execution,
+# Config is for values that are set at the start and never change.
+# If you are using a value from the config and realize it needs to change,
+# you should add it to the state and use/update the version in the state.
+
+
+## Graph Input State
+class BasicInput(BaseModel):
+    # Langgraph needs a nonempty input, but we pass in all static
+    # data through a RunnableConfig.
+    _unused: bool = True
+
+
+## Graph Output State
+class BasicOutput(TypedDict):
+    tool_call_chunk: AIMessageChunk
+
+
+## Graph State
+class BasicState(
+    BasicInput,
+    ToolChoiceInput,
+    ToolCallUpdate,
+    ToolChoiceUpdate,
+):
+    pass
--- a/backend/onyx/agents/agent_search/basic/utils.py
+++ b/backend/onyx/agents/agent_search/basic/utils.py
@@ -0,0 +1,63 @@
+from collections.abc import Iterator
+from typing import cast
+
+from langchain_core.messages import AIMessageChunk
+from langchain_core.messages import BaseMessage
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import LlmDoc
+from onyx.chat.stream_processing.answer_response_handler import AnswerResponseHandler
+from onyx.chat.stream_processing.answer_response_handler import CitationResponseHandler
+from onyx.chat.stream_processing.answer_response_handler import (
+    PassThroughAnswerResponseHandler,
+)
+from onyx.chat.stream_processing.utils import map_document_id_order
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def process_llm_stream(
+    messages: Iterator[BaseMessage],
+    should_stream_answer: bool,
+    writer: StreamWriter,
+    final_search_results: list[LlmDoc] | None = None,
+    displayed_search_results: list[LlmDoc] | None = None,
+) -> AIMessageChunk:
+    """Stream answer pieces via writer; return the accumulated tool-call chunk."""
+    tool_call_chunk = AIMessageChunk(content="")
+
+    if final_search_results and displayed_search_results:
+        answer_handler: AnswerResponseHandler = CitationResponseHandler(
+            context_docs=final_search_results,
+            final_doc_id_to_rank_map=map_document_id_order(final_search_results),
+            display_doc_id_to_rank_map=map_document_id_order(displayed_search_results),
+        )
+    else:
+        answer_handler = PassThroughAnswerResponseHandler()
+
+    full_answer = ""
+    # This stream will be the llm answer if no tool is chosen. When a tool is chosen,
+    # the stream will contain AIMessageChunks with tool call information.
+    for message in messages:
+        answer_piece = message.content
+        if not isinstance(answer_piece, str):
+            # only used for logging, so the string representation is good enough
+            answer_piece = str(answer_piece)
+        full_answer += answer_piece
+
+        if isinstance(message, AIMessageChunk) and (
+            message.tool_call_chunks or message.tool_calls
+        ):
+            tool_call_chunk += message  # type: ignore
+        elif should_stream_answer:
+            for response_part in answer_handler.handle_response_part(message, []):
+                write_custom_event(
+                    "basic_response",
+                    response_part,
+                    writer,
+                )
+
+    logger.debug(f"Full answer: {full_answer}")
+    return cast(AIMessageChunk, tool_call_chunk)
--- a/backend/onyx/agents/agent_search/core_state.py
+++ b/backend/onyx/agents/agent_search/core_state.py
@@ -0,0 +1,21 @@
+from operator import add
+from typing import Annotated
+
+from pydantic import BaseModel
+
+
+class CoreState(BaseModel):
+    """
+    This is the core state that is shared across all subgraphs.
+    """
+    # operator.add is the LangGraph reducer here: log_messages updates are appended.
+    base_question: str = ""
+    log_messages: Annotated[list[str], add] = []
+
+
+class SubgraphCoreState(BaseModel):
+    """
+    Core state for subgraphs: only the log messages, which have no default here.
+    """
+
+    log_messages: Annotated[list[str], add]
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/edges.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/edges.py
@@ -0,0 +1,31 @@
+from collections.abc import Hashable
+from datetime import datetime
+
+from langgraph.types import Send
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnsweringInput,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalInput,
+)
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def send_to_expanded_retrieval(state: SubQuestionAnsweringInput) -> Send | Hashable:
+    """
+    LangGraph edge that dispatches the sub-question to the expanded retrieval node.
+    """
+    dispatch_time = datetime.now()
+    log_message = f"{dispatch_time} -- Sending to expanded retrieval"
+    return Send(
+        "initial_sub_question_expanded_retrieval",
+        ExpandedRetrievalInput(
+            question=state.question,
+            base_search=False,
+            sub_question_id=state.question_id,
+            log_messages=[log_message],
+        ),
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/graph_builder.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/graph_builder.py
@@ -0,0 +1,137 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.edges import (
+    send_to_expanded_retrieval,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.nodes.check_sub_answer import (
+    check_sub_answer,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.nodes.format_sub_answer import (
+    format_sub_answer,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.nodes.generate_sub_answer import (
+    generate_sub_answer,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.nodes.ingest_retrieved_documents import (
+    ingest_retrieved_documents,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionOutput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionState,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnsweringInput,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.graph_builder import (
+    expanded_retrieval_graph_builder,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import get_test_config
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def answer_query_graph_builder() -> StateGraph:
+    """
+    LangGraph sub-graph builder for the initial individual sub-answer generation.
+    """
+    graph = StateGraph(
+        state_schema=AnswerQuestionState,
+        input=SubQuestionAnsweringInput,
+        output=AnswerQuestionOutput,
+    )
+
+    ### Add nodes ###
+
+    # The sub-graph that executes the expanded retrieval process for a sub-question
+    expanded_retrieval = expanded_retrieval_graph_builder().compile()
+    graph.add_node(
+        node="initial_sub_question_expanded_retrieval",
+        action=expanded_retrieval,
+    )
+
+    # The node that ingests the retrieved documents and puts them into the proper
+    # state keys.
+    graph.add_node(
+        node="ingest_retrieval",
+        action=ingest_retrieved_documents,
+    )
+
+    # The node that generates the sub-answer
+    graph.add_node(
+        node="generate_sub_answer",
+        action=generate_sub_answer,
+    )
+
+    # The node that checks the sub-answer
+    graph.add_node(
+        node="answer_check",
+        action=check_sub_answer,
+    )
+
+    # The node that formats the sub-answer for the following initial answer generation
+    graph.add_node(
+        node="format_answer",
+        action=format_sub_answer,
+    )
+
+    ### Add edges ###
+
+    graph.add_conditional_edges(
+        source=START,
+        path=send_to_expanded_retrieval,
+        path_map=["initial_sub_question_expanded_retrieval"],
+    )
+    graph.add_edge(
+        start_key="initial_sub_question_expanded_retrieval",
+        end_key="ingest_retrieval",
+    )
+    graph.add_edge(
+        start_key="ingest_retrieval",
+        end_key="generate_sub_answer",
+    )
+    graph.add_edge(
+        start_key="generate_sub_answer",
+        end_key="answer_check",
+    )
+    graph.add_edge(
+        start_key="answer_check",
+        end_key="format_answer",
+    )
+    graph.add_edge(
+        start_key="format_answer",
+        end_key=END,
+    )
+
+    return graph
+
+
+if __name__ == "__main__":
+    from onyx.db.engine import get_session_context_manager
+    from onyx.llm.factory import get_default_llms
+    from onyx.context.search.models import SearchRequest
+
+    compiled_graph = answer_query_graph_builder().compile()
+    primary_llm, fast_llm = get_default_llms()
+    search_request = SearchRequest(
+        query="what can you do with onyx or danswer?",
+    )
+    with get_session_context_manager() as db_session:
+        graph_config, search_tool = get_test_config(
+            db_session, primary_llm, fast_llm, search_request
+        )
+        inputs = SubQuestionAnsweringInput(
+            question="what can you do with onyx?",
+            question_id="0_0",
+            log_messages=[],
+        )
+        for thing in compiled_graph.stream(
+            input=inputs,
+            # The nodes read the GraphConfig from config["metadata"]["config"].
+            config={"metadata": {"config": graph_config}},
+        ):
+            logger.debug(thing)
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/nodes/check_sub_answer.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/nodes/check_sub_answer.py
@@ -0,0 +1,75 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_message_runs
+from langchain_core.runnables.config import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionState,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnswerCheckUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import parse_question_id
+from onyx.prompts.agent_search import SUB_ANSWER_CHECK_PROMPT
+from onyx.prompts.agent_search import UNKNOWN_ANSWER
+
+
+def check_sub_answer(
+    state: AnswerQuestionState, config: RunnableConfig
+) -> SubQuestionAnswerCheckUpdate:
+    """
+    LangGraph node to check the quality of the sub-answer. answer_quality is False for
+    UNKNOWN_ANSWER; otherwise it is True iff the fast LLM's response contains "yes".
+    """
+    node_start_time = datetime.now()
+    # NOTE(review): level and question_num are not used below.
+    level, question_num = parse_question_id(state.question_id)
+    if state.answer == UNKNOWN_ANSWER:
+        return SubQuestionAnswerCheckUpdate(
+            answer_quality=False,
+            log_messages=[
+                get_langgraph_node_log_string(
+                    graph_component="initial  - generate individual sub answer",
+                    node_name="check sub answer",
+                    node_start_time=node_start_time,
+                    result="unknown answer",
+                )
+            ],
+        )
+    msg = [
+        HumanMessage(
+            content=SUB_ANSWER_CHECK_PROMPT.format(
+                question=state.question,
+                base_answer=state.answer,
+            )
+        )
+    ]
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    fast_llm = graph_config.tooling.fast_llm
+    response = list(
+        fast_llm.stream(
+            prompt=msg,
+        )
+    )
+
+    quality_str: str = merge_message_runs(response, chunk_separator="")[0].content
+    answer_quality = "yes" in quality_str.lower()
+
+    return SubQuestionAnswerCheckUpdate(
+        answer_quality=answer_quality,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="initial  - generate individual sub answer",
+                node_name="check sub answer",
+                node_start_time=node_start_time,
+                result=f"Answer quality: {quality_str}",
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/nodes/format_sub_answer.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/nodes/format_sub_answer.py
@@ -0,0 +1,30 @@
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionOutput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionState,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    SubQuestionAnswerResults,
+)
+
+
+def format_sub_answer(state: AnswerQuestionState) -> AnswerQuestionOutput:
+    """
+    LangGraph node that packages the sub-question state into the graph output.
+
+    The returned answer_results list contains exactly one entry.
+    """
+    sub_answer_result = SubQuestionAnswerResults(
+        question=state.question,
+        question_id=state.question_id,
+        verified_high_quality=state.answer_quality,
+        answer=state.answer,
+        sub_query_retrieval_results=state.expanded_retrieval_results,
+        verified_reranked_documents=state.verified_reranked_documents,
+        context_documents=state.context_documents,
+        cited_documents=state.cited_documents,
+        sub_question_retrieval_stats=state.sub_question_retrieval_stats,
+    )
+    # The output field takes a list, so the single result is wrapped.
+    return AnswerQuestionOutput(answer_results=[sub_answer_result])
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/nodes/generate_sub_answer.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/nodes/generate_sub_answer.py
@@ -0,0 +1,137 @@
+from datetime import datetime
+from typing import Any
+from typing import cast
+
+from langchain_core.messages import merge_message_runs
+from langchain_core.runnables.config import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionState,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnswerGenerationUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    build_sub_question_answer_prompt,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import get_answer_citation_ids
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_persona_agent_prompt_expressions,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import parse_question_id
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import AgentAnswerPiece
+from onyx.chat.models import StreamStopInfo
+from onyx.chat.models import StreamStopReason
+from onyx.chat.models import StreamType
+from onyx.configs.agent_configs import AGENT_MAX_ANSWER_CONTEXT_DOCS
+from onyx.prompts.agent_search import NO_RECOVERED_DOCS
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def generate_sub_answer(
+    state: AnswerQuestionState,
+    config: RunnableConfig,
+    writer: StreamWriter,
+) -> SubQuestionAnswerGenerationUpdate:
+    """
+    LangGraph node to generate a sub-answer, streamed as "sub_answers" events
+    and followed by a "stream_finished" event.
+    """
+    node_start_time = datetime.now()
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    question = state.question
+    level, question_num = parse_question_id(state.question_id)
+    context_docs = state.context_documents[:AGENT_MAX_ANSWER_CONTEXT_DOCS]
+    persona_contextualized_prompt = get_persona_agent_prompt_expressions(
+        graph_config.inputs.search_request.persona
+    ).contextualized_prompt
+
+    if len(context_docs) == 0:
+        answer_str = NO_RECOVERED_DOCS
+        write_custom_event(
+            "sub_answers",
+            AgentAnswerPiece(
+                answer_piece=answer_str,
+                level=level,
+                level_question_num=question_num,
+                answer_type="agent_sub_answer",
+            ),
+            writer,
+        )
+    else:
+        fast_llm = graph_config.tooling.fast_llm
+        msg = build_sub_question_answer_prompt(
+            question=question,
+            original_question=graph_config.inputs.search_request.query,
+            docs=context_docs,
+            persona_specification=persona_contextualized_prompt,
+            config=fast_llm.config,
+        )
+
+        response: list[str | list[str | dict[str, Any]]] = []
+        dispatch_timings: list[float] = []
+        for message in fast_llm.stream(
+            prompt=msg,
+        ):
+            # TODO: in principle, the answer here COULD contain images, but we don't support that yet
+            content = message.content
+            if not isinstance(content, str):
+                raise ValueError(
+                    f"Expected content to be a string, but got {type(content)}"
+                )
+            start_stream_token = datetime.now()
+            write_custom_event(
+                "sub_answers",
+                AgentAnswerPiece(
+                    answer_piece=content,
+                    level=level,
+                    level_question_num=question_num,
+                    answer_type="agent_sub_answer",
+                ),
+                writer,
+            )
+            end_stream_token = datetime.now()
+            # total_seconds() is the whole duration; .microseconds is one component
+            dispatch_timings.append(
+                (end_stream_token - start_stream_token).total_seconds()
+            )
+            response.append(content)
+
+        answer_str = merge_message_runs(response, chunk_separator="")[0].content
+        avg_dispatch_s = sum(dispatch_timings) / len(dispatch_timings)
+        logger.debug(f"Average dispatch time: {avg_dispatch_s:.6f}s")
+
+    answer_citation_ids = get_answer_citation_ids(answer_str)
+    cited_documents = [
+        context_docs[idx] for idx in answer_citation_ids if idx < len(context_docs)
+    ]
+
+    stop_event = StreamStopInfo(
+        stop_reason=StreamStopReason.FINISHED,
+        stream_type=StreamType.SUB_ANSWER,
+        level=level,
+        level_question_num=question_num,
+    )
+    write_custom_event("stream_finished", stop_event, writer)
+
+    return SubQuestionAnswerGenerationUpdate(
+        answer=answer_str,
+        cited_documents=cited_documents,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="initial - generate individual sub answer",
+                node_name="generate sub answer",
+                node_start_time=node_start_time,
+                result="",
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/nodes/ingest_retrieved_documents.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/nodes/ingest_retrieved_documents.py
@@ -0,0 +1,25 @@
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionRetrievalIngestionUpdate,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalOutput,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import AgentChunkRetrievalStats
+
+
+def ingest_retrieved_documents(
+    state: ExpandedRetrievalOutput,
+) -> SubQuestionRetrievalIngestionUpdate:
+    """
+    LangGraph node that copies the expanded-retrieval results into the sub-answer state.
+    """
+    sub_question_retrieval_stats = state.expanded_retrieval_result.retrieval_stats
+    if sub_question_retrieval_stats is None:
+        # The update field holds a single stats object, so the default is not a list.
+        sub_question_retrieval_stats = AgentChunkRetrievalStats()
+    return SubQuestionRetrievalIngestionUpdate(
+        expanded_retrieval_results=state.expanded_retrieval_result.expanded_query_results,
+        verified_reranked_documents=state.expanded_retrieval_result.verified_reranked_documents,
+        context_documents=state.expanded_retrieval_result.context_documents,
+        sub_question_retrieval_stats=sub_question_retrieval_stats,
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/states.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_individual_sub_answer/states.py
@@ -0,0 +1,75 @@
+from operator import add
+from typing import Annotated
+
+from pydantic import BaseModel
+
+from onyx.agents.agent_search.core_state import SubgraphCoreState
+from onyx.agents.agent_search.deep_search.main.states import LoggerUpdate
+from onyx.agents.agent_search.shared_graph_utils.models import AgentChunkRetrievalStats
+from onyx.agents.agent_search.shared_graph_utils.models import QueryRetrievalResult
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    SubQuestionAnswerResults,
+)
+from onyx.agents.agent_search.shared_graph_utils.operators import (
+    dedup_inference_sections,
+)
+from onyx.context.search.models import InferenceSection
+
+
+## Update States
+class SubQuestionAnswerCheckUpdate(LoggerUpdate, BaseModel):  # state update: sub-answer check
+    answer_quality: bool = False  # False unless an update sets it
+    log_messages: list[str] = []  # log strings written by the nodes
+
+
+class SubQuestionAnswerGenerationUpdate(LoggerUpdate, BaseModel):  # state update: sub-answer
+    answer: str = ""  # empty string until an answer is set
+    log_messages: list[str] = []  # log strings written by the nodes
+    cited_documents: Annotated[list[InferenceSection], dedup_inference_sections] = []
+    # answer_stat: AnswerStats  (field currently disabled)
+
+
+class SubQuestionRetrievalIngestionUpdate(LoggerUpdate, BaseModel):  # state update: retrieval
+    expanded_retrieval_results: list[QueryRetrievalResult] = []
+    verified_reranked_documents: Annotated[
+        list[InferenceSection], dedup_inference_sections
+    ] = []  # annotated with dedup_inference_sections, like context_documents below
+    context_documents: Annotated[list[InferenceSection], dedup_inference_sections] = []
+    sub_question_retrieval_stats: AgentChunkRetrievalStats = AgentChunkRetrievalStats()  # single object, not a list
+
+
+## Graph Input State
+
+
+class SubQuestionAnsweringInput(SubgraphCoreState):  # input state of the answering subgraph
+    question: str = ""  # text of the (sub-)question to answer
+    question_id: str = (
+        ""  # "0_0" is the original question, every other id is <level>_<question_num>.
+    )
+    # Level 0 is the original question and its first decomposition, level 1 the follow-up, etc.
+    # question_num is a unique number per original question and per level.
+
+
+## Graph State
+
+
+class AnswerQuestionState(  # graph state: the subgraph input plus all update states
+    SubQuestionAnsweringInput,
+    SubQuestionAnswerGenerationUpdate,
+    SubQuestionAnswerCheckUpdate,
+    SubQuestionRetrievalIngestionUpdate,
+):
+    pass  # no fields of its own; everything comes from the base classes
+
+
+## Graph Output State
+
+
+class AnswerQuestionOutput(LoggerUpdate, BaseModel):
+    """
+    Output state of the sub-question answering subgraph. `answer_results` is a list even
+    though each call of the subgraph only returns one result: if the subgraph is run in
+    parallel, the add operator is used to combine the results of the individual runs.
+    """
+
+    answer_results: Annotated[list[SubQuestionAnswerResults], add] = []
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/edges.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/edges.py
@@ -0,0 +1,50 @@
+from collections.abc import Hashable
+from datetime import datetime
+
+from langgraph.types import Send
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionOutput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnsweringInput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.states import (
+    SubQuestionRetrievalState,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import make_question_id
+
+
+def parallelize_initial_sub_question_answering(
+    state: SubQuestionRetrievalState,
+) -> list[Send | Hashable]:
+    """
+    LangGraph edge to parallelize the initial sub-question answering. If there are no sub-questions,
+    we send empty answers to the initial answer generation, and that answer would be generated
+    solely based on the documents retrieved for the original question.
+    """
+    edge_start_time = datetime.now()
+    if len(state.initial_sub_questions) > 0:
+        return [
+            Send(
+                "answer_query_subgraph",
+                SubQuestionAnsweringInput(
+                    question=question,
+                    question_id=make_question_id(0, question_num + 1),
+                    log_messages=[
+                        f"{edge_start_time} -- Main Edge - Parallelize Initial Sub-question Answering"
+                    ],
+                ),
+            )
+            for question_num, question in enumerate(state.initial_sub_questions)
+        ]
+
+    else:
+        return [
+            Send(
+                "ingest_answers",
+                AnswerQuestionOutput(
+                    answer_results=[],
+                ),
+            )
+        ]
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/graph_builder.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/graph_builder.py
@@ -0,0 +1,96 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.nodes.generate_initial_answer import (
+    generate_initial_answer,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.nodes.validate_initial_answer import (
+    validate_initial_answer,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.states import (
+    SubQuestionRetrievalInput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.states import (
+    SubQuestionRetrievalState,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_sub_answers.graph_builder import (
+    generate_sub_answers_graph_builder,
+)
+from onyx.agents.agent_search.deep_search.initial.retrieve_orig_question_docs.graph_builder import (
+    retrieve_orig_question_docs_graph_builder,
+)
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def generate_initial_answer_graph_builder(test_mode: bool = False) -> StateGraph:
+    """
+    LangGraph graph builder for the initial answer generation.
+    """
+    graph = StateGraph(
+        state_schema=SubQuestionRetrievalState,
+        input=SubQuestionRetrievalInput,
+    )
+
+    # The sub-graph that generates the initial sub-answers
+    generate_sub_answers = generate_sub_answers_graph_builder().compile()
+    graph.add_node(
+        node="generate_sub_answers_subgraph",
+        action=generate_sub_answers,
+    )
+
+    # The sub-graph that retrieves the original question documents. This is run
+    # in parallel with the sub-answer generation process
+    retrieve_orig_question_docs = retrieve_orig_question_docs_graph_builder().compile()
+    graph.add_node(
+        node="retrieve_orig_question_docs_subgraph_wrapper",
+        action=retrieve_orig_question_docs,
+    )
+
+    # Node that generates the initial answer using the results of the previous
+    # two sub-graphs
+    graph.add_node(
+        node="generate_initial_answer",
+        action=generate_initial_answer,
+    )
+
+    # Node that validates the initial answer
+    graph.add_node(
+        node="validate_initial_answer",
+        action=validate_initial_answer,
+    )
+
+    ### Add edges ###
+
+    graph.add_edge(
+        start_key=START,
+        end_key="retrieve_orig_question_docs_subgraph_wrapper",
+    )
+
+    graph.add_edge(
+        start_key=START,
+        end_key="generate_sub_answers_subgraph",
+    )
+
+    # Wait for both, the original question docs and the sub-answers to be generated before proceeding
+    graph.add_edge(
+        start_key=[
+            "retrieve_orig_question_docs_subgraph_wrapper",
+            "generate_sub_answers_subgraph",
+        ],
+        end_key="generate_initial_answer",
+    )
+
+    graph.add_edge(
+        start_key="generate_initial_answer",
+        end_key="validate_initial_answer",
+    )
+
+    graph.add_edge(
+        start_key="validate_initial_answer",
+        end_key=END,
+    )
+
+    return graph
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/generate_initial_answer.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/generate_initial_answer.py
@@ -0,0 +1,313 @@
+from datetime import datetime
+from typing import Any
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_content
+from langchain_core.runnables import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.states import (
+    SubQuestionRetrievalState,
+)
+from onyx.agents.agent_search.deep_search.main.models import AgentBaseMetrics
+from onyx.agents.agent_search.deep_search.main.operations import (
+    calculate_initial_agent_stats,
+)
+from onyx.agents.agent_search.deep_search.main.operations import get_query_info
+from onyx.agents.agent_search.deep_search.main.operations import logger
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialAnswerUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    get_prompt_enrichment_components,
+)
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    trim_prompt_piece,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import InitialAgentResultStats
+from onyx.agents.agent_search.shared_graph_utils.operators import (
+    dedup_inference_sections,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    dispatch_main_answer_stop_info,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import format_docs
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import relevance_from_docs
+from onyx.agents.agent_search.shared_graph_utils.utils import remove_document_citations
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import AgentAnswerPiece
+from onyx.chat.models import ExtendedToolResponse
+from onyx.configs.agent_configs import AGENT_MAX_ANSWER_CONTEXT_DOCS
+from onyx.configs.agent_configs import AGENT_MIN_ORIG_QUESTION_DOCS
+from onyx.context.search.models import InferenceSection
+from onyx.prompts.agent_search import (
+    INITIAL_ANSWER_PROMPT_W_SUB_QUESTIONS,
+)
+from onyx.prompts.agent_search import (
+    INITIAL_ANSWER_PROMPT_WO_SUB_QUESTIONS,
+)
+from onyx.prompts.agent_search import (
+    SUB_QUESTION_ANSWER_TEMPLATE,
+)
+from onyx.prompts.agent_search import UNKNOWN_ANSWER
+from onyx.tools.tool_implementations.search.search_tool import yield_search_responses
+
+
+def generate_initial_answer(
+    state: SubQuestionRetrievalState,
+    config: RunnableConfig,
+    writer: StreamWriter,
+) -> InitialAnswerUpdate:
+    """
+    LangGraph node to generate the initial answer, using the initial sub-questions/sub-answers and the
+    documents retrieved for the original question.
+    """
+    node_start_time = datetime.now()
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    question = graph_config.inputs.search_request.query
+    prompt_enrichment_components = get_prompt_enrichment_components(graph_config)
+
+    sub_questions_cited_documents = state.cited_documents
+    orig_question_retrieval_documents = state.orig_question_retrieved_documents
+
+    consolidated_context_docs: list[InferenceSection] = sub_questions_cited_documents
+    counter = 0
+    for original_doc_number, original_doc in enumerate(
+        orig_question_retrieval_documents
+    ):
+        if original_doc_number not in sub_questions_cited_documents:
+            if (
+                counter <= AGENT_MIN_ORIG_QUESTION_DOCS
+                or len(consolidated_context_docs) < AGENT_MAX_ANSWER_CONTEXT_DOCS
+            ):
+                consolidated_context_docs.append(original_doc)
+                counter += 1
+
+    # sort docs by their scores - though the scores refer to different questions
+    relevant_docs = dedup_inference_sections(
+        consolidated_context_docs, consolidated_context_docs
+    )
+
+    sub_questions: list[str] = []
+    streamed_documents = (
+        relevant_docs
+        if len(relevant_docs) > 0
+        else state.orig_question_retrieved_documents[:15]
+    )
+
+    # Use the query info from the base document retrieval
+    query_info = get_query_info(state.orig_question_sub_query_retrieval_results)
+
+    assert (
+        graph_config.tooling.search_tool
+    ), "search_tool must be provided for agentic search"
+
+    relevance_list = relevance_from_docs(relevant_docs)
+    for tool_response in yield_search_responses(
+        query=question,
+        reranked_sections=streamed_documents,
+        final_context_sections=streamed_documents,
+        search_query_info=query_info,
+        get_section_relevance=lambda: relevance_list,
+        search_tool=graph_config.tooling.search_tool,
+    ):
+        write_custom_event(
+            "tool_response",
+            ExtendedToolResponse(
+                id=tool_response.id,
+                response=tool_response.response,
+                level=0,
+                level_question_num=0,  # 0, 0 is the base question
+            ),
+            writer,
+        )
+
+    if len(relevant_docs) == 0:
+        write_custom_event(
+            "initial_agent_answer",
+            AgentAnswerPiece(
+                answer_piece=UNKNOWN_ANSWER,
+                level=0,
+                level_question_num=0,
+                answer_type="agent_level_answer",
+            ),
+            writer,
+        )
+        dispatch_main_answer_stop_info(0, writer)
+
+        answer = UNKNOWN_ANSWER
+        initial_agent_stats = InitialAgentResultStats(
+            sub_questions={},
+            original_question={},
+            agent_effectiveness={},
+        )
+
+    else:
+        sub_question_answer_results = state.sub_question_results
+
+        # Collect the sub-questions and sub-answers and construct an appropriate
+        # prompt string.
+        # Consider replacing by a function.
+        answered_sub_questions: list[str] = []
+        all_sub_questions: list[str] = []  # Separate list for tracking all questions
+
+        for idx, sub_question_answer_result in enumerate(
+            sub_question_answer_results, start=1
+        ):
+            all_sub_questions.append(sub_question_answer_result.question)
+
+            is_valid_answer = (
+                sub_question_answer_result.verified_high_quality
+                and sub_question_answer_result.answer
+                and sub_question_answer_result.answer != UNKNOWN_ANSWER
+            )
+
+            if is_valid_answer:
+                answered_sub_questions.append(
+                    SUB_QUESTION_ANSWER_TEMPLATE.format(
+                        sub_question=sub_question_answer_result.question,
+                        sub_answer=sub_question_answer_result.answer,
+                        sub_question_num=idx,
+                    )
+                )
+
+        sub_question_answer_str = (
+            "\n\n------\n\n".join(answered_sub_questions)
+            if answered_sub_questions
+            else ""
+        )
+
+        # Use the appropriate prompt based on whether there are sub-questions.
+        base_prompt = (
+            INITIAL_ANSWER_PROMPT_W_SUB_QUESTIONS
+            if answered_sub_questions
+            else INITIAL_ANSWER_PROMPT_WO_SUB_QUESTIONS
+        )
+
+        sub_questions = all_sub_questions  # Replace the original assignment
+
+        model = graph_config.tooling.fast_llm
+
+        doc_context = format_docs(relevant_docs)
+        doc_context = trim_prompt_piece(
+            config=model.config,
+            prompt_piece=doc_context,
+            reserved_str=(
+                base_prompt
+                + sub_question_answer_str
+                + prompt_enrichment_components.persona_prompts.contextualized_prompt
+                + prompt_enrichment_components.history
+                + prompt_enrichment_components.date_str
+            ),
+        )
+
+        msg = [
+            HumanMessage(
+                content=base_prompt.format(
+                    question=question,
+                    answered_sub_questions=remove_document_citations(
+                        sub_question_answer_str
+                    ),
+                    relevant_docs=doc_context,
+                    persona_specification=prompt_enrichment_components.persona_prompts.contextualized_prompt,
+                    history=prompt_enrichment_components.history,
+                    date_prompt=prompt_enrichment_components.date_str,
+                )
+            )
+        ]
+
+        streamed_tokens: list[str | list[str | dict[str, Any]]] = [""]
+        dispatch_timings: list[float] = []
+        for message in model.stream(msg):
+            # TODO: in principle, the answer here COULD contain images, but we don't support that yet
+            content = message.content
+            if not isinstance(content, str):
+                raise ValueError(
+                    f"Expected content to be a string, but got {type(content)}"
+                )
+            start_stream_token = datetime.now()
+
+            write_custom_event(
+                "initial_agent_answer",
+                AgentAnswerPiece(
+                    answer_piece=content,
+                    level=0,
+                    level_question_num=0,
+                    answer_type="agent_level_answer",
+                ),
+                writer,
+            )
+            end_stream_token = datetime.now()
+            dispatch_timings.append(
+                (end_stream_token - start_stream_token).microseconds
+            )
+            streamed_tokens.append(content)
+
+        logger.debug(
+            f"Average dispatch time for initial answer: {sum(dispatch_timings) / len(dispatch_timings)}"
+        )
+
+        dispatch_main_answer_stop_info(0, writer)
+        response = merge_content(*streamed_tokens)
+        answer = cast(str, response)
+
+        initial_agent_stats = calculate_initial_agent_stats(
+            state.sub_question_results, state.orig_question_retrieval_stats
+        )
+
+        logger.debug(
+            f"\n\nYYYYY--Sub-Questions:\n\n{sub_question_answer_str}\n\nStats:\n\n"
+        )
+
+        if initial_agent_stats:
+            logger.debug(initial_agent_stats.original_question)
+            logger.debug(initial_agent_stats.sub_questions)
+            logger.debug(initial_agent_stats.agent_effectiveness)
+
+    agent_base_end_time = datetime.now()
+
+    if agent_base_end_time and state.agent_start_time:
+        duration_s = (agent_base_end_time - state.agent_start_time).total_seconds()
+    else:
+        duration_s = None
+
+    agent_base_metrics = AgentBaseMetrics(
+        num_verified_documents_total=len(relevant_docs),
+        num_verified_documents_core=state.orig_question_retrieval_stats.verified_count,
+        verified_avg_score_core=state.orig_question_retrieval_stats.verified_avg_scores,
+        num_verified_documents_base=initial_agent_stats.sub_questions.get(
+            "num_verified_documents"
+        ),
+        verified_avg_score_base=initial_agent_stats.sub_questions.get(
+            "verified_avg_score"
+        ),
+        base_doc_boost_factor=initial_agent_stats.agent_effectiveness.get(
+            "utilized_chunk_ratio"
+        ),
+        support_boost_factor=initial_agent_stats.agent_effectiveness.get(
+            "support_ratio"
+        ),
+        duration_s=duration_s,
+    )
+
+    return InitialAnswerUpdate(
+        initial_answer=answer,
+        initial_agent_stats=initial_agent_stats,
+        generated_sub_questions=sub_questions,
+        agent_base_end_time=agent_base_end_time,
+        agent_base_metrics=agent_base_metrics,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="initial - generate initial answer",
+                node_name="generate initial answer",
+                node_start_time=node_start_time,
+                result="",
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/validate_initial_answer.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/validate_initial_answer.py
@@ -0,0 +1,40 @@
+from datetime import datetime
+
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.states import (
+    SubQuestionRetrievalState,
+)
+from onyx.agents.agent_search.deep_search.main.operations import logger
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialAnswerQualityUpdate,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+
+
+def validate_initial_answer(
+    state: SubQuestionRetrievalState,
+) -> InitialAnswerQualityUpdate:
+    """
+    Check whether the initial answer sufficiently addresses the original user question.
+    """
+
+    node_start_time = datetime.now()
+
+    logger.debug(
+        f"--------{node_start_time}--------Checking for base answer validity - for not set True/False manually"
+    )
+
+    verdict = True
+
+    return InitialAnswerQualityUpdate(
+        initial_answer_quality_eval=verdict,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="initial - generate initial answer",
+                node_name="validate initial answer",
+                node_start_time=node_start_time,
+                result="",
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/states.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/states.py
@@ -0,0 +1,51 @@
+from operator import add
+from typing import Annotated
+from typing import TypedDict
+
+from onyx.agents.agent_search.core_state import CoreState
+from onyx.agents.agent_search.deep_search.main.states import (
+    ExploratorySearchUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialAnswerQualityUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialAnswerUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialQuestionDecompositionUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    OrigQuestionRetrievalUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    SubQuestionResultsUpdate,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.models import (
+    QuestionRetrievalResult,
+)
+from onyx.context.search.models import InferenceSection
+
+
+### States ###
+class SubQuestionRetrievalInput(CoreState):  # input state of the initial-answer graph
+    exploratory_search_results: list[InferenceSection]  # required field, no default
+
+
+## Graph State
+class SubQuestionRetrievalState(  # graph state: the input plus all update states below
+    # The input state includes the core state
+    SubQuestionRetrievalInput,
+    InitialQuestionDecompositionUpdate,
+    InitialAnswerUpdate,
+    SubQuestionResultsUpdate,
+    OrigQuestionRetrievalUpdate,
+    InitialAnswerQualityUpdate,
+    ExploratorySearchUpdate,
+):
+    base_raw_search_result: Annotated[list[QuestionRetrievalResult], add]  # combined with `add`
+
+
+## Graph Output State
+class SubQuestionRetrievalOutput(TypedDict):  # output state; only the log messages are declared
+    log_messages: list[str]  # log strings written by the nodes
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/edges.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/edges.py
@@ -0,0 +1,48 @@
+from collections.abc import Hashable
+from datetime import datetime
+
+from langgraph.types import Send
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionOutput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnsweringInput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.states import (
+    SubQuestionRetrievalState,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import make_question_id
+
+
+def parallelize_initial_sub_question_answering(
+    state: SubQuestionRetrievalState,
+) -> list[Send | Hashable]:
+    """
+    LangGraph edge to parallelize the initial sub-question answering.
+    """
+    edge_start_time = datetime.now()
+    if len(state.initial_sub_questions) > 0:
+        return [
+            Send(
+                "answer_sub_question_subgraphs",
+                SubQuestionAnsweringInput(
+                    question=question,
+                    question_id=make_question_id(0, question_num + 1),
+                    log_messages=[
+                        f"{edge_start_time} -- Main Edge - Parallelize Initial Sub-question Answering"
+                    ],
+                ),
+            )
+            for question_num, question in enumerate(state.initial_sub_questions)
+        ]
+
+    else:
+        return [
+            Send(
+                "ingest_answers",
+                AnswerQuestionOutput(
+                    answer_results=[],
+                ),
+            )
+        ]
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/graph_builder.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/graph_builder.py
@@ -0,0 +1,81 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.graph_builder import (
+    answer_query_graph_builder,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_sub_answers.edges import (
+    parallelize_initial_sub_question_answering,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_sub_answers.nodes.decompose_orig_question import (
+    decompose_orig_question,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_sub_answers.nodes.format_initial_sub_answers import (
+    format_initial_sub_answers,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_sub_answers.states import (
+    SubQuestionAnsweringInput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_sub_answers.states import (
+    SubQuestionAnsweringState,
+)
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()  # module-level logger
+
+test_mode = False  # NOTE(review): not referenced by the builder in this file - confirm it is needed
+
+
+def generate_sub_answers_graph_builder() -> StateGraph:
+    """
+    LangGraph graph builder for the initial sub-answer generation process.
+    It generates the initial sub-questions and produces the answers.
+    """
+
+    graph = StateGraph(
+        state_schema=SubQuestionAnsweringState,
+        input=SubQuestionAnsweringInput,
+    )
+
+    # Decompose the original question into sub-questions
+    graph.add_node(
+        node="decompose_orig_question",
+        action=decompose_orig_question,
+    )
+
+    # The sub-graph that executes the initial sub-question answering for
+    # each of the sub-questions.
+    answer_sub_question_subgraphs = answer_query_graph_builder().compile()
+    graph.add_node(
+        node="answer_sub_question_subgraphs",
+        action=answer_sub_question_subgraphs,
+    )
+
+    # Node that collects and formats the initial sub-question answers
+    graph.add_node(
+        node="format_initial_sub_question_answers",
+        action=format_initial_sub_answers,
+    )
+
+    graph.add_edge(
+        start_key=START,
+        end_key="decompose_orig_question",
+    )
+
+    graph.add_conditional_edges(
+        source="decompose_orig_question",
+        path=parallelize_initial_sub_question_answering,
+        path_map=["answer_sub_question_subgraphs"],
+    )
+    graph.add_edge(
+        start_key=["answer_sub_question_subgraphs"],
+        end_key="format_initial_sub_question_answers",
+    )
+
+    graph.add_edge(
+        start_key="format_initial_sub_question_answers",
+        end_key=END,
+    )
+
+    return graph
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/nodes/decompose_orig_question.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/nodes/decompose_orig_question.py
@@ -0,0 +1,153 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_content
+from langchain_core.runnables import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.states import (
+    SubQuestionRetrievalState,
+)
+from onyx.agents.agent_search.deep_search.main.models import (
+    AgentRefinedMetrics,
+)
+from onyx.agents.agent_search.deep_search.main.operations import (
+    dispatch_subquestion,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialQuestionDecompositionUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    build_history_prompt,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import dispatch_separated
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import StreamStopInfo
+from onyx.chat.models import StreamStopReason
+from onyx.chat.models import StreamType
+from onyx.chat.models import SubQuestionPiece
+from onyx.configs.agent_configs import AGENT_NUM_DOCS_FOR_DECOMPOSITION
+from onyx.prompts.agent_search import (
+    INITIAL_DECOMPOSITION_PROMPT_QUESTIONS_AFTER_SEARCH,
+)
+from onyx.prompts.agent_search import (
+    INITIAL_QUESTION_DECOMPOSITION_PROMPT,
+)
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def decompose_orig_question(
+    state: SubQuestionRetrievalState,
+    config: RunnableConfig,
+    writer: StreamWriter,
+) -> InitialQuestionDecompositionUpdate:
+    """
+    LangGraph node to decompose the original question into sub-questions.
+    """
+    node_start_time = datetime.now()
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    question = graph_config.inputs.search_request.query
+    perform_initial_search_decomposition = (
+        graph_config.behavior.perform_initial_search_decomposition
+    )
+    # Get the rewritten queries in a defined format
+    model = graph_config.tooling.fast_llm
+
+    history = build_history_prompt(graph_config, question)
+
+    # Use the initial search results to inform the decomposition
+    agent_start_time = datetime.now()
+
+    # Initial search to inform decomposition. Just get top 3 fits
+
+    if perform_initial_search_decomposition:
+        # Due to unfortunate state representation in LangGraph, we need here to double check that the retrieval has
+        # happened prior to this point, allowing silent failure here since it is not critical for decomposition in
+        # all queries.
+        if not state.exploratory_search_results:
+            logger.error("Initial search for decomposition failed")
+
+        sample_doc_str = "\n\n".join(
+            [
+                doc.combined_content
+                for doc in state.exploratory_search_results[
+                    :AGENT_NUM_DOCS_FOR_DECOMPOSITION
+                ]
+            ]
+        )
+
+        decomposition_prompt = (
+            INITIAL_DECOMPOSITION_PROMPT_QUESTIONS_AFTER_SEARCH.format(
+                question=question, sample_doc_str=sample_doc_str, history=history
+            )
+        )
+
+    else:
+        decomposition_prompt = INITIAL_QUESTION_DECOMPOSITION_PROMPT.format(
+            question=question, history=history
+        )
+
+    # Start decomposition
+
+    msg = [HumanMessage(content=decomposition_prompt)]
+
+    # Send the initial question as a subquestion with number 0
+    write_custom_event(
+        "decomp_qs",
+        SubQuestionPiece(
+            sub_question=question,
+            level=0,
+            level_question_num=0,
+        ),
+        writer,
+    )
+    # dispatches custom events for subquestion tokens, adding in subquestion ids.
+    streamed_tokens = dispatch_separated(
+        model.stream(msg), dispatch_subquestion(0, writer)
+    )
+
+    stop_event = StreamStopInfo(
+        stop_reason=StreamStopReason.FINISHED,
+        stream_type=StreamType.SUB_QUESTIONS,
+        level=0,
+    )
+    write_custom_event("stream_finished", stop_event, writer)
+
+    deomposition_response = merge_content(*streamed_tokens)
+
+    # this call should only return strings. Commenting out for efficiency
+    # assert [type(tok) == str for tok in streamed_tokens]
+
+    # use no-op cast() instead of str() which runs code
+    # list_of_subquestions = clean_and_parse_list_string(cast(str, response))
+    list_of_subqs = cast(str, deomposition_response).split("\n")
+
+    decomp_list: list[str] = [sq.strip() for sq in list_of_subqs if sq.strip() != ""]
+
+    return InitialQuestionDecompositionUpdate(
+        initial_sub_questions=decomp_list,
+        agent_start_time=agent_start_time,
+        agent_refined_start_time=None,
+        agent_refined_end_time=None,
+        agent_refined_metrics=AgentRefinedMetrics(
+            refined_doc_boost_factor=None,
+            refined_question_boost_factor=None,
+            duration_s=None,
+        ),
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="initial - generate sub answers",
+                node_name="decompose original question",
+                node_start_time=node_start_time,
+                result=f"decomposed original question into {len(decomp_list)} subquestions",
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/nodes/format_initial_sub_answers.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/nodes/format_initial_sub_answers.py
@@ -0,0 +1,50 @@
+from datetime import datetime
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionOutput,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    SubQuestionResultsUpdate,
+)
+from onyx.agents.agent_search.shared_graph_utils.operators import (
+    dedup_inference_sections,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+
+
+def format_initial_sub_answers(
+    state: AnswerQuestionOutput,
+) -> SubQuestionResultsUpdate:
+    """
+    LangGraph node that collects the verified, context, and cited documents from
+    every initial sub-question answer and returns them deduplicated.
+    """
+    node_start_time = datetime.now()
+    sub_answers = state.answer_results
+    # Flatten the per-answer document lists, keeping the order of the answers
+    all_verified = [
+        doc for ans in sub_answers for doc in ans.verified_reranked_documents
+    ]
+    all_context = [doc for ans in sub_answers for doc in ans.context_documents]
+    all_cited = [doc for ans in sub_answers for doc in ans.cited_documents]
+
+    # The main graph's documents operator dedups as well, so this may be redundant
+    deduped_verified = dedup_inference_sections(all_verified, [])
+    deduped_context = dedup_inference_sections(all_context, [])
+    deduped_cited = dedup_inference_sections(all_cited, [])
+
+    log_message = get_langgraph_node_log_string(
+        graph_component="initial - generate sub answers",
+        node_name="format initial sub answers",
+        node_start_time=node_start_time,
+        result="",
+    )
+    return SubQuestionResultsUpdate(
+        verified_reranked_documents=deduped_verified,
+        context_documents=deduped_context,
+        cited_documents=deduped_cited,
+        sub_question_results=sub_answers,
+        log_messages=[log_message],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/states.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_sub_answers/states.py
@@ -0,0 +1,34 @@
+from typing import TypedDict
+
+from onyx.agents.agent_search.core_state import CoreState
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialAnswerUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialQuestionDecompositionUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    SubQuestionResultsUpdate,
+)
+from onyx.context.search.models import InferenceSection
+
+
+### States ###
+class SubQuestionAnsweringInput(CoreState):
+    exploratory_search_results: list[InferenceSection]  # added on top of CoreState
+
+
+## Graph State: combines the graph input with the three update states below
+class SubQuestionAnsweringState(
+    # The input state comes first; through it this includes the core state
+    SubQuestionAnsweringInput,
+    InitialQuestionDecompositionUpdate,
+    InitialAnswerUpdate,
+    SubQuestionResultsUpdate,
+):
+    pass
+
+
+## Graph Output State
+class SubQuestionAnsweringOutput(TypedDict):
+    log_messages: list[str]  # the only key declared for the graph output
--- a/backend/onyx/agents/agent_search/deep_search/initial/retrieve_orig_question_docs/graph_builder.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/retrieve_orig_question_docs/graph_builder.py
@@ -0,0 +1,81 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from onyx.agents.agent_search.deep_search.initial.retrieve_orig_question_docs.nodes.format_orig_question_search_input import (
+    format_orig_question_search_input,
+)
+from onyx.agents.agent_search.deep_search.initial.retrieve_orig_question_docs.nodes.format_orig_question_search_output import (
+    format_orig_question_search_output,
+)
+from onyx.agents.agent_search.deep_search.initial.retrieve_orig_question_docs.states import (
+    BaseRawSearchInput,
+)
+from onyx.agents.agent_search.deep_search.initial.retrieve_orig_question_docs.states import (
+    BaseRawSearchOutput,
+)
+from onyx.agents.agent_search.deep_search.initial.retrieve_orig_question_docs.states import (
+    BaseRawSearchState,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.graph_builder import (
+    expanded_retrieval_graph_builder,
+)
+
+
+def retrieve_orig_question_docs_graph_builder() -> StateGraph:
+    """
+    LangGraph graph builder for the retrieval of documents that are relevant to
+    the original question. The graph is a linear wrapper around the expanded
+    retrieval sub-graph (format input -> retrieve -> format output) to ensure
+    parallelism with the sub-question answer process.
+    The graph is returned uncompiled.
+    """
+    input_node = "format_orig_question_search_input"
+    retrieval_node = "retrieve_orig_question_docs_subgraph"
+    output_node = "format_orig_question_search_output"
+
+    builder = StateGraph(
+        state_schema=BaseRawSearchState,
+        input=BaseRawSearchInput,
+        output=BaseRawSearchOutput,
+    )
+
+    ### Add nodes ###
+
+    # Format the original question search output
+    builder.add_node(
+        node=output_node,
+        action=format_orig_question_search_output,
+    )
+
+    # The compiled sub-graph that executes the expanded retrieval process
+    builder.add_node(
+        node=retrieval_node,
+        action=expanded_retrieval_graph_builder().compile(),
+    )
+
+    # Format the original question search input
+    builder.add_node(
+        node=input_node,
+        action=format_orig_question_search_input,
+    )
+
+    ### Add edges ###
+
+    # The flow is strictly linear, so the edges simply connect each step of the
+    # sequence to the next one
+    node_sequence = [
+        START,
+        input_node,
+        retrieval_node,
+        output_node,
+        END,
+    ]
+    for start_key, end_key in zip(node_sequence, node_sequence[1:]):
+        builder.add_edge(start_key=start_key, end_key=end_key)
+
+    return builder
+
+
+if __name__ == "__main__":
+    pass  # no-op: nothing is run when the module is executed directly
--- a/backend/onyx/agents/agent_search/deep_search/initial/retrieve_orig_question_docs/nodes/format_orig_question_search_input.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/retrieve_orig_question_docs/nodes/format_orig_question_search_input.py
@@ -0,0 +1,28 @@
+from typing import cast
+
+from langchain_core.runnables.config import RunnableConfig
+
+from onyx.agents.agent_search.core_state import CoreState
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalInput,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def format_orig_question_search_input(
+    state: CoreState, config: RunnableConfig
+) -> ExpandedRetrievalInput:
+    """LangGraph node to build the expanded retrieval input for the original
+    question, which is read from the graph config (`state` is not used).
+    """
+    logger.debug("format_orig_question_search_input")
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    return ExpandedRetrievalInput(
+        question=graph_config.inputs.search_request.query,
+        base_search=True,
+        sub_question_id=None,  # This graph is always and only used for the original question
+        log_messages=[],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/retrieve_orig_question_docs/nodes/format_orig_question_search_output.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/retrieve_orig_question_docs/nodes/format_orig_question_search_output.py
@@ -0,0 +1,30 @@
+from onyx.agents.agent_search.deep_search.main.states import OrigQuestionRetrievalUpdate
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalOutput,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import AgentChunkRetrievalStats
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def format_orig_question_search_output(
+    state: ExpandedRetrievalOutput,
+) -> OrigQuestionRetrievalUpdate:
+    """
+    LangGraph node to format the search result for the original question into the
+    proper format. Missing retrieval stats become a default-constructed stats object.
+    """
+    retrieval_result = state.expanded_retrieval_result
+    # The update always carries a stats object, never None
+    retrieval_stats = retrieval_result.retrieval_stats
+    if retrieval_stats is None:
+        retrieval_stats = AgentChunkRetrievalStats()
+
+    return OrigQuestionRetrievalUpdate(
+        orig_question_verified_reranked_documents=retrieval_result.verified_reranked_documents,
+        orig_question_sub_query_retrieval_results=retrieval_result.expanded_query_results,
+        orig_question_retrieved_documents=state.retrieved_documents,
+        orig_question_retrieval_stats=retrieval_stats,
+        log_messages=[],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/initial/retrieve_orig_question_docs/states.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/retrieve_orig_question_docs/states.py
@@ -0,0 +1,29 @@
+from onyx.agents.agent_search.deep_search.main.states import (
+    OrigQuestionRetrievalUpdate,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalInput,
+)
+
+
+## Graph Input State: the expanded retrieval input without any additional fields
+class BaseRawSearchInput(ExpandedRetrievalInput):
+    pass
+
+
+## Graph Output State
+class BaseRawSearchOutput(OrigQuestionRetrievalUpdate):
+    """
+    Output state of this graph; declares no fields beyond OrigQuestionRetrievalUpdate.
+    NOTE(review): results are described as lists combined with the add operator when
+      the subgraph runs in parallel - presumably defined on the parent class; confirm.
+    """
+
+    # base_expanded_retrieval_result: QuestionRetrievalResult = QuestionRetrievalResult()
+
+
+## Graph State (OrigQuestionRetrievalUpdate is also inherited via the output state)
+class BaseRawSearchState(
+    BaseRawSearchInput, BaseRawSearchOutput, OrigQuestionRetrievalUpdate
+):
+    pass
--- a/backend/onyx/agents/agent_search/deep_search/main/edges.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/edges.py
@@ -0,0 +1,113 @@
+from collections.abc import Hashable
+from datetime import datetime
+from typing import cast
+from typing import Literal
+
+from langchain_core.runnables import RunnableConfig
+from langgraph.types import Send
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionOutput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnsweringInput,
+)
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.deep_search.main.states import (
+    RequireRefinemenEvalUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.utils import make_question_id
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def route_initial_tool_choice(
+    state: MainState, config: RunnableConfig
+) -> Literal["tool_call", "start_agent_search", "logging_node"]:
+    """
+    LangGraph edge to route after the initial tool choice: to the logging node if
+    no tool was chosen, to agent search if agentic search is enabled and the
+    configured search tool was chosen, and to a plain tool call otherwise.
+    """
+    agent_config = cast(GraphConfig, config["metadata"]["config"])
+    if state.tool_choice is None:
+        return "logging_node"
+
+    use_agent_search = (
+        agent_config.behavior.use_agentic_search
+        and agent_config.tooling.search_tool is not None
+        and state.tool_choice.tool.name == agent_config.tooling.search_tool.name
+    )
+    return "start_agent_search" if use_agent_search else "tool_call"
+
+
+def parallelize_initial_sub_question_answering(
+    state: MainState,
+) -> list[Send | Hashable]:
+    """
+    LangGraph edge that fans out one "answer_query_subgraph" Send per initial
+    sub-question, or sends an empty result to "ingest_answers" if there are none.
+    """
+    edge_start_time = datetime.now()
+
+    if len(state.initial_sub_questions) == 0:
+        # Nothing to answer: hand an empty result set straight to ingestion
+        return [Send("ingest_answers", AnswerQuestionOutput(answer_results=[]))]
+
+    log_message = (
+        f"{edge_start_time} -- Main Edge - Parallelize Initial Sub-question Answering"
+    )
+    # Sub-questions are numbered from 1 when their ids are built
+    return [
+        Send(
+            "answer_query_subgraph",
+            SubQuestionAnsweringInput(
+                question=question,
+                question_id=make_question_id(0, question_num),
+                log_messages=[log_message],
+            ),
+        )
+        for question_num, question in enumerate(state.initial_sub_questions, start=1)
+    ]
+
+
+# Define the function that determines whether to continue or not
+def continue_to_refined_answer_or_end(
+    state: RequireRefinemenEvalUpdate,
+) -> Literal["create_refined_sub_questions", "logging_node"]:
+    """Route to the refined sub-question creation if required, else to logging."""
+    if not state.require_refined_answer_eval:
+        return "logging_node"
+    return "create_refined_sub_questions"
+
+
+def parallelize_refined_sub_question_answering(
+    state: MainState,
+) -> list[Send | Hashable]:
+    """
+    LangGraph edge that fans out one "answer_refined_question_subgraphs" Send per
+    refined sub-question, or sends an empty result to "ingest_refined_sub_answers".
+    """
+    edge_start_time = datetime.now()
+
+    if len(state.refined_sub_questions) == 0:
+        return [
+            Send("ingest_refined_sub_answers", AnswerQuestionOutput(answer_results=[]))
+        ]
+
+    log_message = (
+        f"{edge_start_time} -- Main Edge - Parallelize Refined Sub-question Answering"
+    )
+    return [
+        Send(
+            "answer_refined_question_subgraphs",
+            SubQuestionAnsweringInput(
+                question=question_data.sub_question,
+                question_id=make_question_id(1, question_num),
+                log_messages=[log_message],
+            ),
+        )
+        for question_num, question_data in state.refined_sub_questions.items()
+    ]
--- a/backend/onyx/agents/agent_search/deep_search/main/graph_builder.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/graph_builder.py
@@ -0,0 +1,265 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from onyx.agents.agent_search.deep_search.initial.generate_initial_answer.graph_builder import (
+    generate_initial_answer_graph_builder,
+)
+from onyx.agents.agent_search.deep_search.main.edges import (
+    continue_to_refined_answer_or_end,
+)
+from onyx.agents.agent_search.deep_search.main.edges import (
+    parallelize_refined_sub_question_answering,
+)
+from onyx.agents.agent_search.deep_search.main.edges import (
+    route_initial_tool_choice,
+)
+from onyx.agents.agent_search.deep_search.main.nodes.compare_answers import (
+    compare_answers,
+)
+from onyx.agents.agent_search.deep_search.main.nodes.create_refined_sub_questions import (
+    create_refined_sub_questions,
+)
+from onyx.agents.agent_search.deep_search.main.nodes.decide_refinement_need import (
+    decide_refinement_need,
+)
+from onyx.agents.agent_search.deep_search.main.nodes.extract_entities_terms import (
+    extract_entities_terms,
+)
+from onyx.agents.agent_search.deep_search.main.nodes.generate_refined_answer import (
+    generate_refined_answer,
+)
+from onyx.agents.agent_search.deep_search.main.nodes.ingest_refined_sub_answers import (
+    ingest_refined_sub_answers,
+)
+from onyx.agents.agent_search.deep_search.main.nodes.persist_agent_results import (
+    persist_agent_results,
+)
+from onyx.agents.agent_search.deep_search.main.nodes.start_agent_search import (
+    start_agent_search,
+)
+from onyx.agents.agent_search.deep_search.main.states import MainInput
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.deep_search.refinement.consolidate_sub_answers.graph_builder import (
+    answer_refined_query_graph_builder,
+)
+from onyx.agents.agent_search.orchestration.nodes.basic_use_tool_response import (
+    basic_use_tool_response,
+)
+from onyx.agents.agent_search.orchestration.nodes.llm_tool_choice import llm_tool_choice
+from onyx.agents.agent_search.orchestration.nodes.prepare_tool_input import (
+    prepare_tool_input,
+)
+from onyx.agents.agent_search.orchestration.nodes.tool_call import tool_call
+from onyx.agents.agent_search.shared_graph_utils.utils import get_test_config
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+test_mode = False
+
+
+def main_graph_builder(test_mode: bool = False) -> StateGraph:
+    """
+    Build the (uncompiled) main agent search graph. `test_mode` is currently unused.
+    """
+    graph = StateGraph(
+        state_schema=MainState,
+        input=MainInput,
+    )
+
+    # Prepare the tool input
+    graph.add_node(
+        node="prepare_tool_input",
+        action=prepare_tool_input,
+    )
+
+    # Choose the initial tool
+    graph.add_node(
+        node="initial_tool_choice",
+        action=llm_tool_choice,
+    )
+
+    # Call the tool, if required
+    graph.add_node(
+        node="tool_call",
+        action=tool_call,
+    )
+
+    # Use the tool response
+    graph.add_node(
+        node="basic_use_tool_response",
+        action=basic_use_tool_response,
+    )
+
+    # Start the agent search process
+    graph.add_node(
+        node="start_agent_search",
+        action=start_agent_search,
+    )
+
+    # The compiled sub-graph for the initial answer generation
+    generate_initial_answer_subgraph = generate_initial_answer_graph_builder().compile()
+    graph.add_node(
+        node="generate_initial_answer_subgraph",
+        action=generate_initial_answer_subgraph,
+    )
+
+    # Create the refined sub-questions
+    graph.add_node(
+        node="create_refined_sub_questions",
+        action=create_refined_sub_questions,
+    )
+
+    # The compiled sub-graph for the refined sub-answer generation
+    answer_refined_question = answer_refined_query_graph_builder().compile()
+    graph.add_node(
+        node="answer_refined_question_subgraphs",
+        action=answer_refined_question,
+    )
+
+    # Ingest the refined sub-answers
+    graph.add_node(
+        node="ingest_refined_sub_answers",
+        action=ingest_refined_sub_answers,
+    )
+
+    # Node to generate the refined answer
+    graph.add_node(
+        node="generate_refined_answer",
+        action=generate_refined_answer,
+    )
+
+    # Early node to extract the entities and terms from the initial search results.
+    # This information is used to inform the creation of the refined sub-questions
+    graph.add_node(
+        node="extract_entity_term",
+        action=extract_entities_terms,
+    )
+
+    # Decide if the answer needs to be refined (currently always true)
+    graph.add_node(
+        node="decide_refinement_need",
+        action=decide_refinement_need,
+    )
+
+    # Compare the initial and refined answers, and determine whether
+    # the refined answer is sufficiently better
+    graph.add_node(
+        node="compare_answers",
+        action=compare_answers,
+    )
+
+    # Log the results. This will log the stats as well as the answers, sub-questions, and sub-answers
+    graph.add_node(
+        node="logging_node",
+        action=persist_agent_results,
+    )
+
+    ### Add edges ###
+
+    graph.add_edge(start_key=START, end_key="prepare_tool_input")
+
+    graph.add_edge(
+        start_key="prepare_tool_input",
+        end_key="initial_tool_choice",
+    )
+
+    graph.add_conditional_edges(
+        "initial_tool_choice",
+        route_initial_tool_choice,
+        ["tool_call", "start_agent_search", "logging_node"],
+    )
+
+    graph.add_edge(
+        start_key="tool_call",
+        end_key="basic_use_tool_response",
+    )
+    graph.add_edge(
+        start_key="basic_use_tool_response",
+        end_key="logging_node",
+    )
+
+    graph.add_edge(
+        start_key="start_agent_search",
+        end_key="generate_initial_answer_subgraph",
+    )
+
+    graph.add_edge(
+        start_key="start_agent_search",
+        end_key="extract_entity_term",
+    )
+
+    # Wait for the initial answer generation and the entity/term extraction to be complete
+    # before deciding if a refinement is needed.
+    graph.add_edge(
+        start_key=["generate_initial_answer_subgraph", "extract_entity_term"],
+        end_key="decide_refinement_need",
+    )
+
+    graph.add_conditional_edges(
+        source="decide_refinement_need",
+        path=continue_to_refined_answer_or_end,
+        path_map=["create_refined_sub_questions", "logging_node"],
+    )
+
+    graph.add_conditional_edges(
+        source="create_refined_sub_questions",
+        path=parallelize_refined_sub_question_answering,
+        path_map=["answer_refined_question_subgraphs"],  # may also Send to "ingest_refined_sub_answers"
+    )
+    graph.add_edge(
+        start_key="answer_refined_question_subgraphs",
+        end_key="ingest_refined_sub_answers",
+    )
+
+    graph.add_edge(
+        start_key="ingest_refined_sub_answers",
+        end_key="generate_refined_answer",
+    )
+
+    graph.add_edge(
+        start_key="generate_refined_answer",
+        end_key="compare_answers",
+    )
+    graph.add_edge(
+        start_key="compare_answers",
+        end_key="logging_node",
+    )
+
+    graph.add_edge(
+        start_key="logging_node",
+        end_key=END,
+    )
+
+    return graph
+
+
+if __name__ == "__main__":
+    # Manual smoke run. The GraphConfig is passed under "metadata" because the
+    # nodes read it from config["metadata"]["config"].
+    from onyx.db.engine import get_session_context_manager
+    from onyx.llm.factory import get_default_llms
+    from onyx.context.search.models import SearchRequest
+
+    graph = main_graph_builder()
+    compiled_graph = graph.compile()
+    primary_llm, fast_llm = get_default_llms()
+
+    with get_session_context_manager() as db_session:
+        search_request = SearchRequest(query="Who created Excel?")
+        graph_config = get_test_config(
+            db_session, primary_llm, fast_llm, search_request
+        )
+
+        inputs = MainInput(
+            base_question=graph_config.inputs.search_request.query, log_messages=[]
+        )
+
+        for thing in compiled_graph.stream(
+            input=inputs,
+            config={"metadata": {"config": graph_config}},
+            stream_mode="custom",
+            subgraphs=True,
+        ):
+            logger.debug(thing)
--- a/backend/onyx/agents/agent_search/deep_search/main/models.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/models.py
@@ -0,0 +1,36 @@
+from pydantic import BaseModel
+
+
+class RefinementSubQuestion(BaseModel):  # one sub-question of the refinement phase
+    sub_question: str
+    sub_question_id: str  # built with make_question_id(1, <1-based number>)
+    verified: bool  # set to False when the sub-question is created
+    answered: bool  # set to False when the sub-question is created
+    answer: str  # set to "" when the sub-question is created
+
+
+class AgentTimings(BaseModel):  # NOTE(review): "_s" suffix presumably means seconds
+    base_duration_s: float | None
+    refined_duration_s: float | None
+    full_duration_s: float | None
+
+
+class AgentBaseMetrics(BaseModel):
+    # The first four fields are nullable but declare no default value
+    num_verified_documents_total: int | None
+    num_verified_documents_core: int | None
+    verified_avg_score_core: float | None
+    num_verified_documents_base: int | float | None  # NOTE(review): the only count typed int | float - confirm
+    verified_avg_score_base: float | None = None  # from here on, fields default to None
+    base_doc_boost_factor: float | None = None
+    support_boost_factor: float | None = None
+    duration_s: float | None = None
+
+
+class AgentRefinedMetrics(BaseModel):  # every field is nullable and defaults to None
+    refined_doc_boost_factor: float | None = None
+    refined_question_boost_factor: float | None = None
+    duration_s: float | None = None
+
+
+class AgentAdditionalMetrics(BaseModel):
+    pass  # no fields defined
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/compare_answers.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/compare_answers.py
@@ -0,0 +1,71 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.deep_search.main.states import (
+    InitialRefinedAnswerComparisonUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import RefinedAnswerImprovement
+from onyx.prompts.agent_search import (
+    INITIAL_REFINED_ANSWER_COMPARISON_PROMPT,
+)
+
+
+def compare_answers(
+    state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
+) -> InitialRefinedAnswerComparisonUpdate:
+    """
+    LangGraph node that asks the fast LLM whether the refined answer is sufficiently
+    better than the initial answer, writes the verdict as a custom stream event,
+    and returns it in the state update.
+    """
+    node_start_time = datetime.now()
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+
+    comparison_prompt = INITIAL_REFINED_ANSWER_COMPARISON_PROMPT.format(
+        question=graph_config.inputs.search_request.query,
+        initial_answer=state.initial_answer,
+        refined_answer=state.refined_answer,
+    )
+
+    # The reply is only inspected as a whole, so there is no need to stream it
+    llm_response = graph_config.tooling.fast_llm.invoke(
+        [HumanMessage(content=comparison_prompt)]
+    )
+    verdict_text = llm_response.content
+
+    # The refined answer counts as an improvement if the reply contains "yes"
+    # (case-insensitive); non-string content is treated as no improvement
+    refined_answer_improvement = (
+        isinstance(verdict_text, str) and "yes" in verdict_text.lower()
+    )
+
+    write_custom_event(
+        "refined_answer_improvement",
+        RefinedAnswerImprovement(
+            refined_answer_improvement=refined_answer_improvement,
+        ),
+        writer,
+    )
+
+    comparison_log = get_langgraph_node_log_string(
+        graph_component="main",
+        node_name="compare answers",
+        node_start_time=node_start_time,
+        result=f"Answer comparison: {refined_answer_improvement}",
+    )
+
+    return InitialRefinedAnswerComparisonUpdate(
+        refined_answer_improvement_eval=refined_answer_improvement,
+        log_messages=[comparison_log],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/create_refined_sub_questions.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/create_refined_sub_questions.py
@@ -0,0 +1,131 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_content
+from langchain_core.runnables import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.deep_search.main.models import (
+    RefinementSubQuestion,
+)
+from onyx.agents.agent_search.deep_search.main.operations import (
+    dispatch_subquestion,
+)
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.deep_search.main.states import (
+    RefinedQuestionDecompositionUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    build_history_prompt,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import dispatch_separated
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    format_entity_term_extraction,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import make_question_id
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.prompts.agent_search import (
+    REFINEMENT_QUESTION_DECOMPOSITION_PROMPT,
+)
+from onyx.tools.models import ToolCallKickoff
+
+
+def create_refined_sub_questions(
+    state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
+) -> RefinedQuestionDecompositionUpdate:
+    """
+    LangGraph node to create refined sub-questions based on the initial answer, the history,
+    the entity term extraction results found earlier, and the sub-questions that were answered and failed.
+
+    The LLM response is read as one sub-question per line (blank lines dropped,
+    surrounding whitespace stripped) and keyed by 1-based position. Raises
+    ValueError if the merged LLM response is not a string.
+    """
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    question = graph_config.inputs.search_request.query
+
+    # Announce the start of the refined answer creation on the custom stream
+    write_custom_event(
+        "start_refined_answer_creation",
+        ToolCallKickoff(
+            tool_name="agent_search_1",
+            tool_args={"query": question, "answer": state.initial_answer},
+        ),
+        writer,
+    )
+
+    node_start_time = datetime.now()
+    agent_refined_start_time = datetime.now()
+
+    base_answer = state.initial_answer
+    history = build_history_prompt(graph_config, question)
+
+    # get the entity term extraction results and properly format them
+    entity_term_extraction_str = format_entity_term_extraction(
+        state.entity_relation_term_extractions
+    )
+
+    # Split the initial sub-questions by whether their answer was verified as high quality
+    initial_question_answers = state.sub_question_results
+    addressed_question_list = [
+        x.question for x in initial_question_answers if x.verified_high_quality
+    ]
+    failed_question_list = [
+        x.question for x in initial_question_answers if not x.verified_high_quality
+    ]
+
+    msg = [
+        HumanMessage(
+            content=REFINEMENT_QUESTION_DECOMPOSITION_PROMPT.format(
+                question=question,
+                history=history,
+                entity_term_extraction_str=entity_term_extraction_str,
+                base_answer=base_answer,
+                answered_sub_questions="\n - ".join(addressed_question_list),
+                failed_sub_questions="\n - ".join(failed_question_list),
+            ),
+        )
+    ]
+
+    model = graph_config.tooling.fast_llm
+    # Dispatch the sub-question tokens while the LLM is still generating them
+    streamed_tokens = dispatch_separated(
+        model.stream(msg), dispatch_subquestion(1, writer)
+    )
+    response = merge_content(*streamed_tokens)
+
+    if not isinstance(response, str):
+        raise ValueError("LLM response is not a string")
+
+    # Strip each line so that the stored sub-questions carry no stray whitespace,
+    # as is done when the initial sub-questions are parsed
+    parsed_response = [q.strip() for q in response.split("\n") if q.strip()]
+
+    refined_sub_question_dict = {
+        sub_question_num: RefinementSubQuestion(
+            sub_question=sub_question,
+            sub_question_id=make_question_id(1, sub_question_num),
+            verified=False,
+            answered=False,
+            answer="",
+        )
+        for sub_question_num, sub_question in enumerate(parsed_response, start=1)
+    }
+
+    return RefinedQuestionDecompositionUpdate(
+        refined_sub_questions=refined_sub_question_dict,
+        agent_refined_start_time=agent_refined_start_time,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="main",
+                node_name="create refined sub questions",
+                node_start_time=node_start_time,
+                result=f"Created {len(refined_sub_question_dict)} refined sub questions",
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/decide_refinement_need.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/decide_refinement_need.py
@@ -0,0 +1,47 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.runnables import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.deep_search.main.states import (
+    RequireRefinemenEvalUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+
+
+def decide_refinement_need(
+    state: MainState, config: RunnableConfig
+) -> RequireRefinemenEvalUpdate:
+    """
+    LangGraph node to decide if refinement is needed based on the initial answer and the question.
+    At present, we always refine, unless refinement is disabled in the graph config
+    (behavior.allow_refinement), in which case refinement is never required.
+    """
+    node_start_time = datetime.now()
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+
+    decision = True  # TODO: just for current testing purposes
+
+    # The config switch overrides the decision, so refinement can only be required
+    # if it is allowed at all
+    require_refined_answer_eval = (
+        decision if graph_config.behavior.allow_refinement else False
+    )
+
+    return RequireRefinemenEvalUpdate(
+        require_refined_answer_eval=require_refined_answer_eval,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="main",
+                node_name="decide refinement need",
+                node_start_time=node_start_time,
+                # Log the effective decision rather than the pre-override value
+                result=f"Refinement decision: {require_refined_answer_eval}",
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/extract_entities_terms.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/extract_entities_terms.py
@@ -0,0 +1,116 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.runnables import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.main.operations import logger
+from onyx.agents.agent_search.deep_search.main.states import (
+    EntityTermExtractionUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    trim_prompt_piece,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import EntityExtractionResult
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    EntityRelationshipTermExtraction,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import format_docs
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.configs.constants import NUM_EXPLORATORY_DOCS
+from onyx.prompts.agent_search import ENTITY_TERM_EXTRACTION_PROMPT
+from onyx.prompts.agent_search import ENTITY_TERM_EXTRACTION_PROMPT_JSON_EXAMPLE
+
+
+def extract_entities_terms(
+    state: MainState, config: RunnableConfig
+) -> EntityTermExtractionUpdate:
+    """
+    LangGraph node that extracts entities, relationships, and terms from the
+    exploratory search results using the fast LLM.
+
+    If refinement is not allowed by the graph config, no LLM call is made and an
+    empty extraction is returned. If the LLM reply cannot be validated as an
+    EntityExtractionResult, an error is logged and an empty extraction is used.
+    """
+    node_start_time = datetime.now()
+
+    def _empty_extraction() -> EntityRelationshipTermExtraction:
+        # Shared "nothing found" value for the skip path and the parse-failure path.
+        return EntityRelationshipTermExtraction(
+            entities=[],
+            relationships=[],
+            terms=[],
+        )
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+
+    # Guard clause: nothing to extract when the refinement phase is switched off.
+    if not graph_config.behavior.allow_refinement:
+        return EntityTermExtractionUpdate(
+            entity_relation_term_extractions=_empty_extraction(),
+            log_messages=[
+                get_langgraph_node_log_string(
+                    graph_component="main",
+                    node_name="extract entities terms",
+                    node_start_time=node_start_time,
+                    result="Refinement is not allowed",
+                )
+            ],
+        )
+
+    user_question = graph_config.inputs.search_request.query
+    fast_llm = graph_config.tooling.fast_llm
+
+    # Render the leading exploratory documents as the context for the prompt.
+    context_text = format_docs(
+        state.exploratory_search_results[:NUM_EXPLORATORY_DOCS]
+    )
+
+    # Trim the context against the remaining prompt text.
+    # The calculation here is only approximate.
+    remaining_prompt_text = (
+        ENTITY_TERM_EXTRACTION_PROMPT
+        + user_question
+        + ENTITY_TERM_EXTRACTION_PROMPT_JSON_EXAMPLE
+    )
+    context_text = trim_prompt_piece(
+        fast_llm.config, context_text, remaining_prompt_text
+    )
+
+    prompt_text = (
+        ENTITY_TERM_EXTRACTION_PROMPT.format(
+            question=user_question, context=context_text
+        )
+        + ENTITY_TERM_EXTRACTION_PROMPT_JSON_EXAMPLE
+    )
+    llm_response = fast_llm.invoke(
+        prompt=[HumanMessage(content=prompt_text)],
+    )
+
+    # Drop markdown code fences, then keep only the span from the first "{" to
+    # the last "}" so that surrounding chatter does not break JSON validation.
+    raw_text = str(llm_response.content)
+    for fence in ("```json\n", "\n```"):
+        raw_text = raw_text.replace(fence, "")
+    json_start = raw_text.find("{")
+    json_end = raw_text.rfind("}")
+    json_candidate = raw_text[json_start : json_end + 1]
+
+    try:
+        parsed_result = EntityExtractionResult.model_validate_json(json_candidate)
+    except ValueError:
+        logger.error("Failed to parse LLM response as JSON in Entity-Term Extraction")
+        parsed_result = EntityExtractionResult(
+            retrieved_entities_relationships=_empty_extraction(),
+        )
+
+    node_log = get_langgraph_node_log_string(
+        graph_component="main",
+        node_name="extract entities terms",
+        node_start_time=node_start_time,
+    )
+    return EntityTermExtractionUpdate(
+        entity_relation_term_extractions=parsed_result.retrieved_entities_relationships,
+        log_messages=[node_log],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/generate_refined_answer.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/generate_refined_answer.py
@@ -0,0 +1,339 @@
+from datetime import datetime
+from typing import Any
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_content
+from langchain_core.runnables import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.deep_search.main.models import (
+    AgentRefinedMetrics,
+)
+from onyx.agents.agent_search.deep_search.main.operations import get_query_info
+from onyx.agents.agent_search.deep_search.main.operations import logger
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.deep_search.main.states import (
+    RefinedAnswerUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    get_prompt_enrichment_components,
+)
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    trim_prompt_piece,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import InferenceSection
+from onyx.agents.agent_search.shared_graph_utils.models import RefinedAgentStats
+from onyx.agents.agent_search.shared_graph_utils.operators import (
+    dedup_inference_sections,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    dispatch_main_answer_stop_info,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import format_docs
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import parse_question_id
+from onyx.agents.agent_search.shared_graph_utils.utils import relevance_from_docs
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    remove_document_citations,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import AgentAnswerPiece
+from onyx.chat.models import ExtendedToolResponse
+from onyx.configs.agent_configs import AGENT_MAX_ANSWER_CONTEXT_DOCS
+from onyx.configs.agent_configs import AGENT_MIN_ORIG_QUESTION_DOCS
+from onyx.prompts.agent_search import (
+    REFINED_ANSWER_PROMPT_W_SUB_QUESTIONS,
+)
+from onyx.prompts.agent_search import (
+    REFINED_ANSWER_PROMPT_WO_SUB_QUESTIONS,
+)
+from onyx.prompts.agent_search import (
+    SUB_QUESTION_ANSWER_TEMPLATE_REFINED,
+)
+from onyx.prompts.agent_search import UNKNOWN_ANSWER
+from onyx.tools.tool_implementations.search.search_tool import yield_search_responses
+
+
+def generate_refined_answer(
+    state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
+) -> RefinedAnswerUpdate:
+    """
+    LangGraph node to generate the refined answer.
+
+    The node
+      1. builds the answer context from the documents cited by the sub-answers plus
+         verified documents of the original question,
+      2. streams those documents (or, when there are none, the first retrieved
+         original-question documents) as "tool_response" events,
+      3. prompts the fast LLM with the question, the initial answer, the answered
+         sub-questions and the documents, streaming the reply as
+         "refined_agent_answer" events,
+      4. returns the answer together with refinement statistics and timings.
+
+    Args:
+        state: main graph state holding documents, sub-question results and the
+            initial answer.
+        config: runnable config; the GraphConfig is read from config["metadata"]["config"].
+        writer: LangGraph stream writer that receives the custom events.
+
+    Returns:
+        RefinedAnswerUpdate with the refined answer, stats, metrics and a node log message.
+
+    Raises:
+        AssertionError: if no search tool is configured.
+        ValueError: if the LLM streams a content chunk that is not a string.
+    """
+
+    node_start_time = datetime.now()
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    question = graph_config.inputs.search_request.query
+    prompt_enrichment_components = get_prompt_enrichment_components(graph_config)
+
+    persona_contextualized_prompt = (
+        prompt_enrichment_components.persona_prompts.contextualized_prompt
+    )
+
+    verified_reranked_documents = state.verified_reranked_documents
+    sub_questions_cited_documents = state.cited_documents
+    original_question_verified_documents = (
+        state.orig_question_verified_reranked_documents
+    )
+    original_question_retrieved_documents = state.orig_question_retrieved_documents
+
+    # Work on a copy: appending to the state's own list would silently mutate
+    # `state.cited_documents` for every later reader of the graph state.
+    consolidated_context_docs: list[InferenceSection] = list(
+        sub_questions_cited_documents
+    )
+
+    counter = 0
+    for original_doc in original_question_verified_documents:
+        # Compare the document itself (not its position) against the cited docs,
+        # so that already-cited documents do not use up the context budget.
+        if original_doc not in sub_questions_cited_documents:
+            if (
+                counter <= AGENT_MIN_ORIG_QUESTION_DOCS
+                or len(consolidated_context_docs)
+                < 1.5
+                * AGENT_MAX_ANSWER_CONTEXT_DOCS  # allow for larger context in refinement
+            ):
+                consolidated_context_docs.append(original_doc)
+                counter += 1
+
+    # sort docs by their scores - though the scores refer to different questions
+    relevant_docs = dedup_inference_sections(
+        consolidated_context_docs, consolidated_context_docs
+    )
+
+    streaming_docs = (
+        relevant_docs
+        if len(relevant_docs) > 0
+        else original_question_retrieved_documents[:15]
+    )
+
+    query_info = get_query_info(state.orig_question_sub_query_retrieval_results)
+    assert (
+        graph_config.tooling.search_tool
+    ), "search_tool must be provided for agentic search"
+    # stream refined answer docs, or original question docs if no relevant docs are found
+    # NOTE(review): the relevance list is built from `relevant_docs`, so it is empty
+    # when the fallback documents are streamed - confirm that this is intended.
+    relevance_list = relevance_from_docs(relevant_docs)
+    for tool_response in yield_search_responses(
+        query=question,
+        reranked_sections=streaming_docs,
+        final_context_sections=streaming_docs,
+        search_query_info=query_info,
+        get_section_relevance=lambda: relevance_list,
+        search_tool=graph_config.tooling.search_tool,
+    ):
+        write_custom_event(
+            "tool_response",
+            ExtendedToolResponse(
+                id=tool_response.id,
+                response=tool_response.response,
+                level=1,
+                level_question_num=0,  # 0, 0 is the base question
+            ),
+            writer,
+        )
+
+    # Ratio of context documents to the verified sub-question documents;
+    # 10.0 is the fallback used when there is nothing to compare against.
+    if len(verified_reranked_documents) > 0:
+        refined_doc_effectiveness = len(relevant_docs) / len(
+            verified_reranked_documents
+        )
+    else:
+        refined_doc_effectiveness = 10.0
+
+    sub_question_answer_results = state.sub_question_results
+
+    answered_sub_question_answer_list: list[str] = []
+    sub_questions: list[str] = []
+    initial_answered_sub_questions: set[str] = set()
+    refined_answered_sub_questions: set[str] = set()
+
+    for i, result in enumerate(sub_question_answer_results, 1):
+        question_level, _ = parse_question_id(result.question_id)
+        sub_questions.append(result.question)
+
+        if (
+            result.verified_high_quality
+            and result.answer
+            and result.answer != UNKNOWN_ANSWER
+        ):
+            # Level 0 sub-questions belong to the initial pass, all others to refinement.
+            sub_question_type = "initial" if question_level == 0 else "refined"
+            question_set = (
+                initial_answered_sub_questions
+                if question_level == 0
+                else refined_answered_sub_questions
+            )
+            question_set.add(result.question)
+
+            answered_sub_question_answer_list.append(
+                SUB_QUESTION_ANSWER_TEMPLATE_REFINED.format(
+                    sub_question=result.question,
+                    sub_answer=result.answer,
+                    sub_question_num=i,
+                    sub_question_type=sub_question_type,
+                )
+            )
+
+    # Calculate efficiency
+    total_answered_questions = (
+        initial_answered_sub_questions | refined_answered_sub_questions
+    )
+    if initial_answered_sub_questions:
+        revision_question_efficiency = len(total_answered_questions) / len(
+            initial_answered_sub_questions
+        )
+    elif refined_answered_sub_questions:
+        revision_question_efficiency = 10.0
+    else:
+        revision_question_efficiency = 1.0
+
+    # Drop duplicate entries but keep their original order: joining a `set` would
+    # make the prompt text vary from run to run.
+    sub_question_answer_str = "\n\n------\n\n".join(
+        dict.fromkeys(answered_sub_question_answer_list)
+    )
+    initial_answer = state.initial_answer or ""
+
+    # Choose appropriate prompt template
+    base_prompt = (
+        REFINED_ANSWER_PROMPT_W_SUB_QUESTIONS
+        if answered_sub_question_answer_list
+        else REFINED_ANSWER_PROMPT_WO_SUB_QUESTIONS
+    )
+
+    model = graph_config.tooling.fast_llm
+    relevant_docs_str = format_docs(relevant_docs)
+    # Trim the document text against all other prompt parts.
+    relevant_docs_str = trim_prompt_piece(
+        model.config,
+        relevant_docs_str,
+        base_prompt
+        + question
+        + sub_question_answer_str
+        + initial_answer
+        + persona_contextualized_prompt
+        + prompt_enrichment_components.history,
+    )
+
+    msg = [
+        HumanMessage(
+            content=base_prompt.format(
+                question=question,
+                history=prompt_enrichment_components.history,
+                answered_sub_questions=remove_document_citations(
+                    sub_question_answer_str
+                ),
+                relevant_docs=relevant_docs_str,
+                initial_answer=remove_document_citations(initial_answer)
+                if initial_answer
+                else None,
+                persona_specification=persona_contextualized_prompt,
+                date_prompt=prompt_enrichment_components.date_str,
+            )
+        )
+    ]
+
+    streamed_tokens: list[str | list[str | dict[str, Any]]] = [""]
+    dispatch_timings: list[float] = []
+    for message in model.stream(msg):
+        # TODO: in principle, the answer here COULD contain images, but we don't support that yet
+        content = message.content
+        if not isinstance(content, str):
+            raise ValueError(
+                f"Expected content to be a string, but got {type(content)}"
+            )
+
+        start_stream_token = datetime.now()
+        write_custom_event(
+            "refined_agent_answer",
+            AgentAnswerPiece(
+                answer_piece=content,
+                level=1,
+                level_question_num=0,
+                answer_type="agent_level_answer",
+            ),
+            writer,
+        )
+        end_stream_token = datetime.now()
+        # Full elapsed time in microseconds; `.microseconds` alone is only the
+        # sub-second component of the timedelta.
+        dispatch_timings.append(
+            (end_stream_token - start_stream_token).total_seconds() * 1_000_000
+        )
+        streamed_tokens.append(content)
+
+    # The LLM may stream nothing at all; do not divide by zero in that case.
+    if dispatch_timings:
+        logger.debug(
+            f"Average dispatch time for refined answer: {sum(dispatch_timings) / len(dispatch_timings)}"
+        )
+    dispatch_main_answer_stop_info(1, writer)
+    response = merge_content(*streamed_tokens)
+    answer = cast(str, response)
+
+    refined_agent_stats = RefinedAgentStats(
+        revision_doc_efficiency=refined_doc_effectiveness,
+        revision_question_efficiency=revision_question_efficiency,
+    )
+
+    logger.debug(f"\n\n---INITIAL ANSWER ---\n\n Answer:\n Agent: {initial_answer}")
+    logger.debug("-" * 10)
+    logger.debug(f"\n\n---REVISED AGENT ANSWER ---\n\n Answer:\n Agent: {answer}")
+
+    logger.debug("-" * 100)
+
+    if state.initial_agent_stats:
+        initial_doc_boost_factor = state.initial_agent_stats.agent_effectiveness.get(
+            "utilized_chunk_ratio", "--"
+        )
+        initial_support_boost_factor = (
+            state.initial_agent_stats.agent_effectiveness.get("support_ratio", "--")
+        )
+        num_initial_verified_docs = state.initial_agent_stats.original_question.get(
+            "num_verified_documents", "--"
+        )
+        initial_verified_docs_avg_score = (
+            state.initial_agent_stats.original_question.get("verified_avg_score", "--")
+        )
+        initial_sub_questions_verified_docs = (
+            state.initial_agent_stats.sub_questions.get("num_verified_documents", "--")
+        )
+
+        logger.debug("INITIAL AGENT STATS")
+        logger.debug(f"Document Boost Factor: {initial_doc_boost_factor}")
+        logger.debug(f"Support Boost Factor: {initial_support_boost_factor}")
+        logger.debug(f"Originally Verified Docs: {num_initial_verified_docs}")
+        logger.debug(
+            f"Originally Verified Docs Avg Score: {initial_verified_docs_avg_score}"
+        )
+        logger.debug(
+            f"Sub-Questions Verified Docs: {initial_sub_questions_verified_docs}"
+        )
+    if refined_agent_stats:
+        logger.debug("-" * 10)
+        logger.debug("REFINED AGENT STATS")
+        logger.debug(
+            f"Revision Doc Factor: {refined_agent_stats.revision_doc_efficiency}"
+        )
+        logger.debug(
+            f"Revision Question Factor: {refined_agent_stats.revision_question_efficiency}"
+        )
+
+    # Duration of the refinement phase, available only if its start was recorded.
+    agent_refined_end_time = datetime.now()
+    if state.agent_refined_start_time:
+        agent_refined_duration = (
+            agent_refined_end_time - state.agent_refined_start_time
+        ).total_seconds()
+    else:
+        agent_refined_duration = None
+
+    agent_refined_metrics = AgentRefinedMetrics(
+        refined_doc_boost_factor=refined_agent_stats.revision_doc_efficiency,
+        refined_question_boost_factor=refined_agent_stats.revision_question_efficiency,
+        duration_s=agent_refined_duration,
+    )
+
+    return RefinedAnswerUpdate(
+        refined_answer=answer,
+        refined_answer_quality=True,  # TODO: replace this with the actual check value
+        refined_agent_stats=refined_agent_stats,
+        agent_refined_end_time=agent_refined_end_time,
+        agent_refined_metrics=agent_refined_metrics,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="main",
+                node_name="generate refined answer",
+                node_start_time=node_start_time,
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/ingest_refined_sub_answers.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/ingest_refined_sub_answers.py
@@ -0,0 +1,42 @@
+from datetime import datetime
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionOutput,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    SubQuestionResultsUpdate,
+)
+from onyx.agents.agent_search.shared_graph_utils.operators import (
+    dedup_inference_sections,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+
+
+def ingest_refined_sub_answers(
+    state: AnswerQuestionOutput,
+) -> SubQuestionResultsUpdate:
+    """
+    LangGraph node that collects the refined sub-answers and the verified, reranked
+    documents attached to them into a single update for the main graph.
+    """
+    node_start_time = datetime.now()
+
+    sub_answers = state.answer_results
+
+    # Flatten the documents of all sub-answers into one list.
+    collected_docs = [
+        doc
+        for sub_answer in sub_answers
+        for doc in sub_answer.verified_reranked_documents
+    ]
+
+    # Deduping is done by the documents operator for the main graph
+    # so we might not need to dedup here
+    unique_docs = dedup_inference_sections(collected_docs, [])
+
+    node_log = get_langgraph_node_log_string(
+        graph_component="main",
+        node_name="ingest refined answers",
+        node_start_time=node_start_time,
+    )
+    return SubQuestionResultsUpdate(
+        verified_reranked_documents=unique_docs,
+        sub_question_results=sub_answers,
+        log_messages=[node_log],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/persist_agent_results.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/persist_agent_results.py
@@ -0,0 +1,129 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.runnables import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.main.models import (
+    AgentAdditionalMetrics,
+)
+from onyx.agents.agent_search.deep_search.main.models import AgentTimings
+from onyx.agents.agent_search.deep_search.main.operations import logger
+from onyx.agents.agent_search.deep_search.main.states import MainOutput
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.models import CombinedAgentMetrics
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.db.chat import log_agent_metrics
+from onyx.db.chat import log_agent_sub_question_results
+
+
+def persist_agent_results(state: MainState, config: RunnableConfig) -> MainOutput:
+    """
+    LangGraph node that writes the agent's metrics and sub-question results to the
+    database (when persistence is configured) and emits the collected log messages.
+    """
+    node_start_time = datetime.now()
+
+    def _elapsed_s(start: datetime | None, end: datetime | None) -> float | None:
+        # A duration exists only when both of its endpoints were recorded.
+        if start and end:
+            return (end - start).total_seconds()
+        return None
+
+    run_start = state.agent_start_time
+    base_end = state.agent_base_end_time
+    refined_start = state.agent_refined_start_time
+    refined_end = state.agent_refined_end_time
+    # The run ends with the refined phase when there was one, else with the base phase.
+    run_end = refined_end or base_end
+
+    base_duration = _elapsed_s(run_start, base_end)
+    refined_duration = _elapsed_s(refined_start, refined_end)
+    full_duration = _elapsed_s(run_start, run_end)
+
+    agent_type = "base"
+    if refined_duration:
+        agent_type = "refined"
+
+    base_metrics = state.agent_base_metrics
+    refined_metrics = state.agent_refined_metrics
+
+    timings = AgentTimings(
+        base_duration_s=base_duration,
+        refined_duration_s=refined_duration,
+        full_duration_s=full_duration,
+    )
+    combined_agent_metrics = CombinedAgentMetrics(
+        timings=timings,
+        base_metrics=base_metrics,
+        refined_metrics=refined_metrics,
+        additional_metrics=AgentAdditionalMetrics(),
+    )
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+
+    persona = graph_config.inputs.search_request.persona
+    persona_id = persona.id if persona else None
+
+    assert (
+        graph_config.tooling.search_tool
+    ), "search_tool must be provided for agentic search"
+    user = graph_config.tooling.search_tool.user
+    user_id = user.id if user else None
+
+    persistence = graph_config.persistence
+    if persistence:
+        # Metrics are logged only when the base phase produced a duration.
+        if base_duration is not None:
+            log_agent_metrics(
+                db_session=persistence.db_session,
+                user_id=user_id,
+                persona_id=persona_id,
+                agent_type=agent_type,
+                start_time=run_start,
+                agent_metrics=combined_agent_metrics,
+            )
+
+        # Persist the sub-answer in the database
+        log_agent_sub_question_results(
+            db_session=persistence.db_session,
+            chat_session_id=persistence.chat_session_id,
+            primary_message_id=persistence.message_id,
+            sub_question_answer_results=state.sub_question_results,
+        )
+
+    node_log = get_langgraph_node_log_string(
+        graph_component="main",
+        node_name="persist agent results",
+        node_start_time=node_start_time,
+    )
+    main_output = MainOutput(log_messages=[node_log])
+
+    # Replay the log messages gathered by the earlier nodes.
+    for log_message in state.log_messages:
+        logger.debug(log_message)
+
+    if base_metrics:
+        logger.debug(f"Initial loop: {base_metrics.duration_s}")
+    if refined_metrics:
+        logger.debug(f"Refined loop: {refined_metrics.duration_s}")
+
+    both_durations_known = (
+        base_metrics
+        and refined_metrics
+        and base_metrics.duration_s
+        and refined_metrics.duration_s
+    )
+    if both_durations_known:
+        logger.debug(
+            f"Total time: {float(state.agent_base_metrics.duration_s) + float(state.agent_refined_metrics.duration_s)}"
+        )
+
+    return main_output
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/start_agent_search.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/start_agent_search.py
@@ -0,0 +1,52 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.runnables import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.main.states import (
+    ExploratorySearchUpdate,
+)
+from onyx.agents.agent_search.deep_search.main.states import MainState
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    build_history_prompt,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import retrieve_search_docs
+from onyx.configs.agent_configs import AGENT_EXPLORATORY_SEARCH_RESULTS
+from onyx.context.search.models import InferenceSection
+
+
+def start_agent_search(
+    state: MainState, config: RunnableConfig
+) -> ExploratorySearchUpdate:
+    """
+    LangGraph node that starts the agentic search: it builds the history prompt and
+    runs an exploratory retrieval for the user's question.
+    """
+    node_start_time = datetime.now()
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    user_query = graph_config.inputs.search_request.query
+
+    history_prompt = build_history_prompt(graph_config, user_query)
+
+    search_tool = graph_config.tooling.search_tool
+    assert search_tool, "search_tool must be provided for agentic search"
+
+    # Initial search to inform decomposition; only the first
+    # AGENT_EXPLORATORY_SEARCH_RESULTS documents are kept.
+    all_hits: list[InferenceSection] = retrieve_search_docs(search_tool, user_query)
+    leading_hits = all_hits[:AGENT_EXPLORATORY_SEARCH_RESULTS]
+
+    node_log = get_langgraph_node_log_string(
+        graph_component="main",
+        node_name="start agent search",
+        node_start_time=node_start_time,
+    )
+    return ExploratorySearchUpdate(
+        exploratory_search_results=leading_hits,
+        previous_history_summary=history_prompt,
+        log_messages=[node_log],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/main/operations.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/operations.py
@@ -0,0 +1,132 @@
+from collections.abc import Callable
+
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.shared_graph_utils.models import AgentChunkRetrievalStats
+from onyx.agents.agent_search.shared_graph_utils.models import InitialAgentResultStats
+from onyx.agents.agent_search.shared_graph_utils.models import QueryRetrievalResult
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    SubQuestionAnswerResults,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import SubQuestionPiece
+from onyx.context.search.models import IndexFilters
+from onyx.tools.models import SearchQueryInfo
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def dispatch_subquestion(
+    level: int, writer: StreamWriter
+) -> Callable[[str, int], None]:
+    """
+    Build a callback that streams pieces of sub-question text for the given level.
+
+    The returned callable takes a text piece and the number of the sub-question it
+    belongs to, and writes them as a "decomp_qs" custom event to `writer`.
+    """
+
+    def _emit_piece(sub_question_part: str, sep_num: int) -> None:
+        piece = SubQuestionPiece(
+            sub_question=sub_question_part,
+            level=level,
+            level_question_num=sep_num,
+        )
+        write_custom_event("decomp_qs", piece, writer)
+
+    return _emit_piece
+
+
+def calculate_initial_agent_stats(
+    decomp_answer_results: list[SubQuestionAnswerResults],
+    original_question_stats: AgentChunkRetrievalStats,
+) -> InitialAgentResultStats:
+    """
+    Compare sub-question retrieval with the retrieval for the original question.
+
+    Args:
+        decomp_answer_results: sub-question results whose retrieval stats are aggregated.
+        original_question_stats: retrieval stats of the original question.
+
+    Returns:
+        InitialAgentResultStats with
+          - sub_questions: number of distinct verified chunks and the average score,
+          - original_question: the same two figures for the original question,
+          - agent_effectiveness: "utilized_chunk_ratio" and "support_ratio", i.e. the
+            sub-question figures relative to the original question. A ratio is 10
+            when only the sub-questions produced a value, 0 when only the original
+            question did, and None when neither did.
+    """
+    initial_agent_result_stats: InitialAgentResultStats = InitialAgentResultStats(
+        sub_questions={},
+        original_question={},
+        agent_effectiveness={},
+    )
+
+    orig_verified = original_question_stats.verified_count
+    orig_support_score = original_question_stats.verified_avg_scores
+
+    verified_document_chunk_ids = []
+    support_scores = 0.0
+
+    for decomp_answer_result in decomp_answer_results:
+        retrieval_stats = decomp_answer_result.sub_question_retrieval_stats
+        verified_document_chunk_ids += retrieval_stats.verified_doc_chunk_ids
+        if retrieval_stats.verified_avg_scores is not None:
+            support_scores += retrieval_stats.verified_avg_scores
+
+    verified_document_chunk_ids = list(set(verified_document_chunk_ids))
+
+    # Calculate sub-question stats
+    if verified_document_chunk_ids:
+        # A non-empty id list implies at least one result, so the division is safe.
+        # The average is taken over all results, including those without a score.
+        sub_question_stats: dict[str, float | int | None] = {
+            "num_verified_documents": len(verified_document_chunk_ids),
+            "verified_avg_score": float(support_scores / len(decomp_answer_results)),
+        }
+    else:
+        sub_question_stats = {"num_verified_documents": 0, "verified_avg_score": None}
+
+    initial_agent_result_stats.sub_questions.update(sub_question_stats)
+
+    # Get original question stats
+    initial_agent_result_stats.original_question.update(
+        {
+            "num_verified_documents": original_question_stats.verified_count,
+            "verified_avg_score": original_question_stats.verified_avg_scores,
+        }
+    )
+
+    # Calculate chunk utilization ratio
+    sub_verified = initial_agent_result_stats.sub_questions["num_verified_documents"]
+
+    chunk_ratio: float | None = None
+    if sub_verified is not None and orig_verified is not None and orig_verified > 0:
+        chunk_ratio = (float(sub_verified) / orig_verified) if sub_verified > 0 else 0.0
+    elif sub_verified is not None and sub_verified > 0:
+        chunk_ratio = 10.0
+
+    initial_agent_result_stats.agent_effectiveness["utilized_chunk_ratio"] = chunk_ratio
+
+    # Calculate support ratio. "Missing" must cover both None and 0.0 as one
+    # condition: written inline as `A or B and C`, Python evaluates it as
+    # `A or (B and C)`, which reported None whenever the original score was None,
+    # even if the sub-questions did have a score.
+    sub_support_score = initial_agent_result_stats.sub_questions["verified_avg_score"]
+    orig_support_missing = orig_support_score is None or orig_support_score == 0.0
+
+    if orig_support_missing and sub_support_score is None:
+        initial_agent_result_stats.agent_effectiveness["support_ratio"] = None
+    elif orig_support_missing:
+        initial_agent_result_stats.agent_effectiveness["support_ratio"] = 10
+    elif sub_support_score is None:
+        initial_agent_result_stats.agent_effectiveness["support_ratio"] = 0
+    else:
+        initial_agent_result_stats.agent_effectiveness["support_ratio"] = (
+            sub_support_score / orig_support_score
+        )
+
+    return initial_agent_result_stats
+
+
+def get_query_info(results: list[QueryRetrievalResult]) -> SearchQueryInfo:
+    """
+    Return the query info of the first retrieval result that has one.
+
+    The query info from the base document retrieval is used for some fields that
+    are the same across the searches done. If no result carries query info, a
+    default SearchQueryInfo is returned instead.
+    """
+    first_query_info = next(
+        (
+            retrieval.query_info
+            for retrieval in results
+            if retrieval.query_info is not None
+        ),
+        None,
+    )
+    if first_query_info:
+        return first_query_info
+
+    return SearchQueryInfo(
+        predicted_search=None,
+        final_filters=IndexFilters(access_control_list=None),
+        recency_bias_multiplier=1.0,
+    )
--- a/backend/onyx/agents/agent_search/deep_search/main/states.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/states.py
@@ -0,0 +1,172 @@
+from datetime import datetime
+from operator import add
+from typing import Annotated
+from typing import TypedDict
+
+from pydantic import BaseModel
+
+from onyx.agents.agent_search.core_state import CoreState
+from onyx.agents.agent_search.deep_search.main.models import AgentBaseMetrics
+from onyx.agents.agent_search.deep_search.main.models import (
+    AgentRefinedMetrics,
+)
+from onyx.agents.agent_search.deep_search.main.models import (
+    RefinementSubQuestion,
+)
+from onyx.agents.agent_search.orchestration.states import ToolCallUpdate
+from onyx.agents.agent_search.orchestration.states import ToolChoiceInput
+from onyx.agents.agent_search.orchestration.states import ToolChoiceUpdate
+from onyx.agents.agent_search.shared_graph_utils.models import AgentChunkRetrievalStats
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    EntityRelationshipTermExtraction,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import InitialAgentResultStats
+from onyx.agents.agent_search.shared_graph_utils.models import QueryRetrievalResult
+from onyx.agents.agent_search.shared_graph_utils.models import RefinedAgentStats
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    SubQuestionAnswerResults,
+)
+from onyx.agents.agent_search.shared_graph_utils.operators import (
+    dedup_inference_sections,
+)
+from onyx.agents.agent_search.shared_graph_utils.operators import (
+    dedup_question_answer_results,
+)
+from onyx.context.search.models import InferenceSection
+
+
+### States ###
+class LoggerUpdate(BaseModel):  # mixin for state updates that carry log lines
+    log_messages: Annotated[list[str], add] = []  # annotated with operator.add; presumably merged by list concatenation — confirm
+
+
+class RefinedAgentStartStats(BaseModel):  # start timestamp of the refined phase
+    agent_refined_start_time: datetime | None = None  # defaults to None
+
+
+class RefinedAgentEndStats(BaseModel):  # end timestamp and metrics of the refined phase
+    agent_refined_end_time: datetime | None = None  # defaults to None
+    agent_refined_metrics: AgentRefinedMetrics = AgentRefinedMetrics()  # default-constructed when not provided
+
+
+class InitialQuestionDecompositionUpdate(
+    RefinedAgentStartStats, RefinedAgentEndStats, LoggerUpdate
+):  # also inherits the refined start/end stats fields and log_messages
+    agent_start_time: datetime | None = None  # defaults to None
+    previous_history: str | None = None  # history text; its format is not visible here
+    initial_sub_questions: list[str] = []  # sub-question strings; empty by default
+
+
+class ExploratorySearchUpdate(LoggerUpdate):  # output of the exploratory search step
+    exploratory_search_results: list[InferenceSection] = []  # empty by default
+    previous_history_summary: str | None = None  # defaults to None
+
+
+class InitialRefinedAnswerComparisonUpdate(LoggerUpdate):
+    """
+    Evaluation of whether the refined answer is better than the initial answer.
+    """
+
+    refined_answer_improvement_eval: bool = False  # defaults to False; presumably True means "refined is better" — confirm
+
+
+class InitialAnswerUpdate(LoggerUpdate):
+    """
+    Initial answer information: answer text, stats, sub-questions, and base-phase end time/metrics.
+    """
+
+    initial_answer: str | None = None  # defaults to None
+    initial_agent_stats: InitialAgentResultStats | None = None  # defaults to None
+    generated_sub_questions: list[str] = []  # empty by default
+    agent_base_end_time: datetime | None = None  # defaults to None
+    agent_base_metrics: AgentBaseMetrics | None = None  # defaults to None
+
+
+class RefinedAnswerUpdate(RefinedAgentEndStats, LoggerUpdate):
+    """
+    Refined answer information; also inherits the refined end time and metrics fields.
+    """
+
+    refined_answer: str | None = None  # defaults to None
+    refined_agent_stats: RefinedAgentStats | None = None  # defaults to None
+    refined_answer_quality: bool = False  # defaults to False
+
+
+class InitialAnswerQualityUpdate(LoggerUpdate):
+    """
+    Result of the initial answer quality evaluation.
+    """
+
+    initial_answer_quality_eval: bool = False  # defaults to False
+
+
+class RequireRefinemenEvalUpdate(LoggerUpdate):  # NOTE(review): "Refinemen" looks like a typo for "Refinement"; a rename would affect importers
+    require_refined_answer_eval: bool = True  # defaults to True, unlike the other eval flags in this module
+
+
+class SubQuestionResultsUpdate(LoggerUpdate):  # documents and answers gathered from the sub-questions
+    verified_reranked_documents: Annotated[
+        list[InferenceSection], dedup_inference_sections
+    ] = []
+    context_documents: Annotated[list[InferenceSection], dedup_inference_sections] = []
+    cited_documents: Annotated[
+        list[InferenceSection], dedup_inference_sections
+    ] = []  # cited docs from sub-answers are used for answer context
+    sub_question_results: Annotated[
+        list[SubQuestionAnswerResults], dedup_question_answer_results
+    ] = []  # annotated with a different operator than the document lists above
+
+
+class OrigQuestionRetrievalUpdate(LoggerUpdate):  # retrieval output for the original question
+    orig_question_retrieved_documents: Annotated[
+        list[InferenceSection], dedup_inference_sections
+    ]  # no default: must be supplied when constructing this model
+    orig_question_verified_reranked_documents: Annotated[
+        list[InferenceSection], dedup_inference_sections
+    ]  # no default: must be supplied when constructing this model
+    orig_question_sub_query_retrieval_results: list[QueryRetrievalResult] = []  # empty by default
+    orig_question_retrieval_stats: AgentChunkRetrievalStats = AgentChunkRetrievalStats()  # default-constructed
+
+
+class EntityTermExtractionUpdate(LoggerUpdate):  # carries the entity/relationship/term extraction result
+    entity_relation_term_extractions: EntityRelationshipTermExtraction = (
+        EntityRelationshipTermExtraction()
+    )  # default-constructed when not provided
+
+
+class RefinedQuestionDecompositionUpdate(RefinedAgentStartStats, LoggerUpdate):  # also inherits agent_refined_start_time
+    refined_sub_questions: dict[int, RefinementSubQuestion] = {}  # int-keyed; the meaning of the key is not visible here
+
+
+## Graph Input State: adds no fields beyond CoreState
+class MainInput(CoreState):
+    pass
+
+
+## Graph State: combines the input state with every update model listed below
+class MainState(
+    # This includes the core state
+    MainInput,
+    ToolChoiceInput,
+    ToolCallUpdate,
+    ToolChoiceUpdate,
+    InitialQuestionDecompositionUpdate,
+    InitialAnswerUpdate,
+    SubQuestionResultsUpdate,
+    OrigQuestionRetrievalUpdate,
+    EntityTermExtractionUpdate,
+    InitialAnswerQualityUpdate,
+    RequireRefinemenEvalUpdate,
+    RefinedQuestionDecompositionUpdate,
+    RefinedAnswerUpdate,
+    RefinedAgentStartStats,
+    RefinedAgentEndStats,
+    InitialRefinedAnswerComparisonUpdate,
+    ExploratorySearchUpdate,
+):
+    pass  # declares no fields of its own; everything is inherited
+
+
+## Graph Output State - presently not used
+class MainOutput(TypedDict):
+    log_messages: list[str]  # plain list here, without the operator annotation used in LoggerUpdate
--- a/backend/onyx/agents/agent_search/deep_search/refinement/consolidate_sub_answers/edges.py
+++ b/backend/onyx/agents/agent_search/deep_search/refinement/consolidate_sub_answers/edges.py
@@ -0,0 +1,33 @@
+from collections.abc import Hashable
+from datetime import datetime
+
+from langgraph.types import Send
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnsweringInput,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalInput,
+)
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def send_to_expanded_refined_retrieval(
+    state: SubQuestionAnsweringInput,
+) -> Send | Hashable:
+    """
+    LangGraph edge that sends a refined sub-question to the expanded retrieval node.
+    """
+    logger.debug("sending to expanded retrieval for follow up question via edge")
+    # The only timestamp needed is the one embedded in the log message below.
+    return Send(
+        "refined_sub_question_expanded_retrieval",
+        ExpandedRetrievalInput(
+            question=state.question,
+            sub_question_id=state.question_id,
+            base_search=False,
+            log_messages=[f"{datetime.now()} -- Sending to expanded retrieval"],
+        ),
+    )
--- a/backend/onyx/agents/agent_search/deep_search/refinement/consolidate_sub_answers/graph_builder.py
+++ b/backend/onyx/agents/agent_search/deep_search/refinement/consolidate_sub_answers/graph_builder.py
@@ -0,0 +1,132 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.nodes.check_sub_answer import (
+    check_sub_answer,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.nodes.format_sub_answer import (
+    format_sub_answer,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.nodes.generate_sub_answer import (
+    generate_sub_answer,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.nodes.ingest_retrieved_documents import (
+    ingest_retrieved_documents,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionOutput,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    AnswerQuestionState,
+)
+from onyx.agents.agent_search.deep_search.initial.generate_individual_sub_answer.states import (
+    SubQuestionAnsweringInput,
+)
+from onyx.agents.agent_search.deep_search.refinement.consolidate_sub_answers.edges import (
+    send_to_expanded_refined_retrieval,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.graph_builder import (
+    expanded_retrieval_graph_builder,
+)
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def answer_refined_query_graph_builder() -> StateGraph:
+    """
+    LangGraph graph builder for the refined sub-answer generation process; returns the uncompiled StateGraph.
+    """
+    graph = StateGraph(
+        state_schema=AnswerQuestionState,
+        input=SubQuestionAnsweringInput,
+        output=AnswerQuestionOutput,
+    )
+
+    ### Add nodes ###
+
+    # Subgraph for the expanded retrieval process (compiled here and added as a single node)
+    expanded_retrieval = expanded_retrieval_graph_builder().compile()
+    graph.add_node(
+        node="refined_sub_question_expanded_retrieval",
+        action=expanded_retrieval,
+    )
+
+    # Ingest the retrieved documents
+    graph.add_node(
+        node="ingest_refined_retrieval",
+        action=ingest_retrieved_documents,
+    )
+
+    # Generate the refined sub-answer
+    graph.add_node(
+        node="generate_refined_sub_answer",
+        action=generate_sub_answer,
+    )
+
+    # Check if the refined sub-answer is correct
+    graph.add_node(
+        node="refined_sub_answer_check",
+        action=check_sub_answer,
+    )
+
+    # Format the refined sub-answer
+    graph.add_node(
+        node="format_refined_sub_answer",
+        action=format_sub_answer,
+    )
+
+    ### Add edges ###
+    # START is routed by a conditional edge; the remaining edges form a linear chain to END.
+    graph.add_conditional_edges(
+        source=START,
+        path=send_to_expanded_refined_retrieval,
+        path_map=["refined_sub_question_expanded_retrieval"],
+    )
+    graph.add_edge(
+        start_key="refined_sub_question_expanded_retrieval",
+        end_key="ingest_refined_retrieval",
+    )
+    graph.add_edge(
+        start_key="ingest_refined_retrieval",
+        end_key="generate_refined_sub_answer",
+    )
+    graph.add_edge(
+        start_key="generate_refined_sub_answer",
+        end_key="refined_sub_answer_check",
+    )
+    graph.add_edge(
+        start_key="refined_sub_answer_check",
+        end_key="format_refined_sub_answer",
+    )
+    graph.add_edge(
+        start_key="format_refined_sub_answer",
+        end_key=END,
+    )
+
+    return graph
+
+
+if __name__ == "__main__":  # manual debugging entry point
+    from onyx.db.engine import get_session_context_manager
+    from onyx.llm.factory import get_default_llms
+    from onyx.context.search.models import SearchRequest
+
+    graph = answer_refined_query_graph_builder()
+    compiled_graph = graph.compile()
+    primary_llm, fast_llm = get_default_llms()  # neither LLM is used below
+    search_request = SearchRequest(  # not used below
+        query="what can you do with onyx or danswer?",
+    )
+    with get_session_context_manager() as db_session:  # db_session is not used below
+        inputs = SubQuestionAnsweringInput(
+            question="what can you do with onyx?",
+            question_id="0_0",
+            log_messages=[],
+        )
+        for thing in compiled_graph.stream(  # NOTE(review): no `config` is passed, yet nodes such as expand_queries read config["metadata"]["config"] — confirm this still runs
+            input=inputs,
+            stream_mode="custom",
+        ):
+            logger.debug(thing)
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/edges.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/edges.py
@@ -0,0 +1,42 @@
+from collections.abc import Hashable
+from typing import cast
+
+from langchain_core.runnables.config import RunnableConfig
+from langgraph.types import Send
+
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalState,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    RetrievalInput,
+)
+from onyx.agents.agent_search.models import GraphConfig
+
+
+def parallel_retrieval_edge(
+    state: ExpandedRetrievalState, config: RunnableConfig
+) -> list[Send | Hashable]:
+    """
+    LangGraph edge that fans out one `retrieve_documents` Send per expanded
+    query, followed by one for the question itself.
+    """
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    # Fall back to the search request's query when the state has no question.
+    question = state.question or graph_config.inputs.search_request.query
+
+    # The question itself is retrieved last, after all expanded queries.
+    queries_to_run = state.expanded_queries + [question]
+
+    # One Send per query so the retrievals can be dispatched in parallel.
+    sends: list[Send | Hashable] = []
+    for query in queries_to_run:
+        retrieval_input = RetrievalInput(
+            query_to_retrieve=query,
+            question=question,
+            base_search=False,
+            sub_question_id=state.sub_question_id,
+            log_messages=[],
+        )
+        sends.append(Send("retrieve_documents", retrieval_input))
+
+    return sends
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/graph_builder.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/graph_builder.py
@@ -0,0 +1,161 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.edges import (
+    parallel_retrieval_edge,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.nodes.expand_queries import (
+    expand_queries,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.nodes.format_queries import (
+    format_queries,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.nodes.format_results import (
+    format_results,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.nodes.kickoff_verification import (
+    kickoff_verification,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.nodes.rerank_documents import (
+    rerank_documents,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.nodes.retrieve_documents import (
+    retrieve_documents,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.nodes.verify_documents import (
+    verify_documents,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalInput,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalOutput,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalState,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import get_test_config
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def expanded_retrieval_graph_builder() -> StateGraph:
+    """
+    LangGraph graph builder for the expanded retrieval process; returns the uncompiled StateGraph.
+    """
+    graph = StateGraph(
+        state_schema=ExpandedRetrievalState,
+        input=ExpandedRetrievalInput,
+        output=ExpandedRetrievalOutput,
+    )
+
+    ### Add nodes ###
+
+    # Convert the question into multiple sub-queries
+    graph.add_node(
+        node="expand_queries",
+        action=expand_queries,
+    )
+
+    # Pass the expanded sub-queries on as a list of strings
+    graph.add_node(
+        node="format_queries",
+        action=format_queries,
+    )
+
+    # Retrieve the documents for each sub-query
+    graph.add_node(
+        node="retrieve_documents",
+        action=retrieve_documents,
+    )
+
+    # Start verification process that the documents are relevant to the question (not the query)
+    graph.add_node(
+        node="kickoff_verification",
+        action=kickoff_verification,
+    )
+
+    # Verify that a given document is relevant to the question (not the query)
+    graph.add_node(
+        node="verify_documents",
+        action=verify_documents,
+    )
+
+    # Rerank the documents that have been verified
+    graph.add_node(
+        node="rerank_documents",
+        action=rerank_documents,
+    )
+
+    # Assemble the final retrieval result (and stream tool responses for sub-questions)
+    graph.add_node(
+        node="format_results",
+        action=format_results,
+    )
+
+    ### Add edges ###
+    graph.add_edge(
+        start_key=START,
+        end_key="expand_queries",
+    )
+    graph.add_edge(
+        start_key="expand_queries",
+        end_key="format_queries",
+    )
+
+    graph.add_conditional_edges(
+        source="format_queries",
+        path=parallel_retrieval_edge,
+        path_map=["retrieve_documents"],
+    )
+    graph.add_edge(
+        start_key="retrieve_documents",
+        end_key="kickoff_verification",
+    )  # no edge leaves kickoff_verification: it is a Command node that routes to verify_documents itself
+    graph.add_edge(
+        start_key="verify_documents",
+        end_key="rerank_documents",
+    )
+    graph.add_edge(
+        start_key="rerank_documents",
+        end_key="format_results",
+    )
+    graph.add_edge(
+        start_key="format_results",
+        end_key=END,
+    )
+
+    return graph
+
+
+if __name__ == "__main__":  # manual debugging entry point
+    from onyx.db.engine import get_session_context_manager
+    from onyx.llm.factory import get_default_llms
+    from onyx.context.search.models import SearchRequest
+
+    graph = expanded_retrieval_graph_builder()
+    compiled_graph = graph.compile()
+    primary_llm, fast_llm = get_default_llms()
+    search_request = SearchRequest(
+        query="what can you do with onyx or danswer?",
+    )
+
+    with get_session_context_manager() as db_session:
+        graph_config, search_tool = get_test_config(
+            db_session, primary_llm, fast_llm, search_request
+        )
+        inputs = ExpandedRetrievalInput(
+            question="what can you do with onyx?",
+            base_search=False,
+            sub_question_id=None,
+            log_messages=[],
+        )
+        for thing in compiled_graph.stream(
+            input=inputs,
+            config={"metadata": {"config": graph_config}},  # nodes read config["metadata"]["config"]
+            stream_mode="custom",
+            subgraphs=True,
+        ):
+            logger.debug(thing)
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/models.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/models.py
@@ -0,0 +1,13 @@
+from pydantic import BaseModel
+
+from onyx.agents.agent_search.shared_graph_utils.models import AgentChunkRetrievalStats
+from onyx.agents.agent_search.shared_graph_utils.models import QueryRetrievalResult
+from onyx.context.search.models import InferenceSection
+
+
+class QuestionRetrievalResult(BaseModel):  # bundled output of one expanded-retrieval run (built in format_results)
+    expanded_query_results: list[QueryRetrievalResult] = []  # per-query retrieval results
+    retrieved_documents: list[InferenceSection] = []  # empty by default
+    verified_reranked_documents: list[InferenceSection] = []  # empty by default
+    context_documents: list[InferenceSection] = []  # empty by default
+    retrieval_stats: AgentChunkRetrievalStats = AgentChunkRetrievalStats()  # default-constructed when not provided
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/expand_queries.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/expand_queries.py
@@ -0,0 +1,75 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_message_runs
+from langchain_core.runnables.config import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.operations import (
+    dispatch_subquery,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalInput,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    QueryExpansionUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.utils import dispatch_separated
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import parse_question_id
+from onyx.prompts.agent_search import (
+    QUERY_REWRITING_PROMPT,
+)
+
+
+def expand_queries(
+    state: ExpandedRetrievalInput,
+    config: RunnableConfig,
+    writer: StreamWriter,
+) -> QueryExpansionUpdate:
+    """
+    LangGraph node to expand a question into multiple search queries (one per line of the LLM response).
+    """
+    # Sometimes we want to expand the original question, sometimes we want to expand a sub-question.
+    # When we are running this node on the original question, no question is explicitly passed in.
+    # Instead, we use the original question from the search request.
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    node_start_time = datetime.now()
+    question = state.question or graph_config.inputs.search_request.query
+
+    llm = graph_config.tooling.fast_llm
+    sub_question_id = state.sub_question_id
+    if sub_question_id is None:
+        level, question_num = 0, 0  # no id: treated as level 0, question 0
+    else:
+        level, question_num = parse_question_id(sub_question_id)
+
+    msg = [
+        HumanMessage(
+            content=QUERY_REWRITING_PROMPT.format(question=question),
+        )
+    ]
+    # Presumably streams each sub-query to `writer` while collecting the chunks — confirm in dispatch_separated.
+    llm_response_list = dispatch_separated(
+        llm.stream(prompt=msg), dispatch_subquery(level, question_num, writer)
+    )
+    # Merge the streamed chunks back into one message and take its content.
+    llm_response = merge_message_runs(llm_response_list, chunk_separator="")[0].content
+    # Each line of the response is treated as one rewritten query.
+    rewritten_queries = llm_response.split("\n")
+
+    return QueryExpansionUpdate(
+        expanded_queries=rewritten_queries,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="shared - expanded retrieval",
+                node_name="expand queries",
+                node_start_time=node_start_time,
+                result=f"Number of expanded queries: {len(rewritten_queries)}",
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/format_queries.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/format_queries.py
@@ -0,0 +1,19 @@
+from langchain_core.runnables.config import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalState,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    QueryExpansionUpdate,
+)
+
+
+def format_queries(
+    state: ExpandedRetrievalState, config: RunnableConfig
+) -> QueryExpansionUpdate:
+    """
+    LangGraph node that re-emits the state's expanded queries unchanged.
+    """
+    # `config` is not used by this node.
+    queries = state.expanded_queries
+    return QueryExpansionUpdate(expanded_queries=queries)
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/format_results.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/format_results.py
@@ -0,0 +1,91 @@
+from typing import cast
+
+from langchain_core.runnables.config import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.deep_search.main.operations import get_query_info
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.models import (
+    QuestionRetrievalResult,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.operations import (
+    calculate_sub_question_retrieval_stats,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalState,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.models import AgentChunkRetrievalStats
+from onyx.agents.agent_search.shared_graph_utils.utils import parse_question_id
+from onyx.agents.agent_search.shared_graph_utils.utils import relevance_from_docs
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import ExtendedToolResponse
+from onyx.tools.tool_implementations.search.search_tool import yield_search_responses
+
+
+def format_results(
+    state: ExpandedRetrievalState,
+    config: RunnableConfig,
+    writer: StreamWriter,
+) -> ExpandedRetrievalUpdate:
+    """
+    LangGraph node that packages the retrieval state into a QuestionRetrievalResult and, for sub-questions, streams the search tool responses.
+    """
+    level, question_num = parse_question_id(state.sub_question_id or "0_0")  # a missing id is treated as "0_0"
+    query_info = get_query_info(state.query_retrieval_results)
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+
+    # Main question docs will be sent later after aggregation and deduping with sub-question docs
+    reranked_documents = state.reranked_documents
+
+    if not (level == 0 and question_num == 0):
+        if not reranked_documents and state.query_retrieval_results:
+            # The sub-question is used as the last query. If no verified documents are found, stream
+            # the top 3 for that one (skipped when no query produced a result). We may want to revisit this.
+            reranked_documents = state.query_retrieval_results[-1].retrieved_documents[
+                :3
+            ]
+
+        assert (
+            graph_config.tooling.search_tool
+        ), "search_tool must be provided for agentic search"
+
+        relevance_list = relevance_from_docs(reranked_documents)
+        for tool_response in yield_search_responses(
+            query=state.question,
+            reranked_sections=state.retrieved_documents,
+            final_context_sections=reranked_documents,
+            search_query_info=query_info,
+            get_section_relevance=lambda: relevance_list,
+            search_tool=graph_config.tooling.search_tool,
+        ):
+            write_custom_event(
+                "tool_response",
+                ExtendedToolResponse(
+                    id=tool_response.id,
+                    response=tool_response.response,
+                    level=level,
+                    level_question_num=question_num,
+                ),
+                writer,
+            )
+    sub_question_retrieval_stats = calculate_sub_question_retrieval_stats(
+        verified_documents=state.verified_documents,
+        expanded_retrieval_results=state.query_retrieval_results,
+    )
+
+    if sub_question_retrieval_stats is None:
+        sub_question_retrieval_stats = AgentChunkRetrievalStats()  # fall back to default stats
+
+    return ExpandedRetrievalUpdate(
+        expanded_retrieval_result=QuestionRetrievalResult(
+            expanded_query_results=state.query_retrieval_results,
+            retrieved_documents=state.retrieved_documents,
+            verified_reranked_documents=reranked_documents,  # may be the top-3 fallback chosen above
+            context_documents=state.reranked_documents,  # always the state's reranked list, never the fallback
+            retrieval_stats=sub_question_retrieval_stats,
+        ),
+    )
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/kickoff_verification.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/kickoff_verification.py
@@ -0,0 +1,44 @@
+from typing import Literal
+
+from langchain_core.runnables.config import RunnableConfig
+from langgraph.types import Command
+from langgraph.types import Send
+
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    DocVerificationInput,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalState,
+)
+
+
+def kickoff_verification(
+    state: ExpandedRetrievalState,
+    config: RunnableConfig,
+) -> Command[Literal["verify_documents"]]:
+    """
+    LangGraph Command node that fans out document verification: it emits one
+    Send to "verify_documents" per retrieved document. The Command carries an
+    empty state update, so at present this node only performs routing.
+    """
+    question = state.question
+    question_id = state.sub_question_id
+
+    # One verification task per retrieved document, all for the same question.
+    verification_sends = []
+    for doc in state.retrieved_documents:
+        verification_input = DocVerificationInput(
+            retrieved_document_to_verify=doc,
+            question=question,
+            base_search=False,
+            sub_question_id=question_id,
+            log_messages=[],
+        )
+        verification_sends.append(
+            Send(node="verify_documents", arg=verification_input)
+        )
+
+    return Command(
+        update={},
+        goto=verification_sends,
+    )
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/rerank_documents.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/rerank_documents.py
@@ -0,0 +1,105 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.runnables.config import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.operations import (
+    logger,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    DocRerankingUpdate,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    ExpandedRetrievalState,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.calculations import get_fit_scores
+from onyx.agents.agent_search.shared_graph_utils.models import RetrievalFitStats
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.configs.agent_configs import AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS
+from onyx.configs.agent_configs import AGENT_RERANKING_STATS
+from onyx.context.search.models import InferenceSection
+from onyx.context.search.models import SearchRequest
+from onyx.context.search.pipeline import retrieval_preprocessing
+from onyx.context.search.postprocessing.postprocessing import rerank_sections
+from onyx.db.engine import get_session_context_manager
+
+
+def rerank_documents(
+    state: ExpandedRetrievalState, config: RunnableConfig
+) -> DocRerankingUpdate:
+    """
+    LangGraph node to rerank the retrieved and verified documents. A part of the
+    pre-existing pipeline is used here. Falls back to the verified order when reranking is skipped.
+    """
+    node_start_time = datetime.now()
+    verified_documents = state.verified_documents
+
+    # Rerank post retrieval and verification. First, create a search query
+    # then create the list of reranked sections
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    question = (
+        state.question if state.question else graph_config.inputs.search_request.query
+    )
+    assert (
+        graph_config.tooling.search_tool
+    ), "search_tool must be provided for agentic search"
+    with get_session_context_manager() as db_session:
+        # we ignore some of the user specified fields since this search is
+        # internal to agentic search, but we still want to pass through
+        # persona (for stuff like document sets) and rerank settings
+        # (to not make an unnecessary db call).
+        search_request = SearchRequest(
+            query=question,
+            persona=graph_config.inputs.search_request.persona,
+            rerank_settings=graph_config.inputs.search_request.rerank_settings,
+        )
+        _search_query = retrieval_preprocessing(
+            search_request=search_request,
+            user=graph_config.tooling.search_tool.user,  # bit of a hack
+            llm=graph_config.tooling.fast_llm,
+            db_session=db_session,
+        )
+
+    # skip section filtering
+
+    rerank_settings = _search_query.rerank_settings
+    reranking_enabled = bool(
+        rerank_settings
+        and rerank_settings.rerank_model_name
+        and rerank_settings.num_rerank > 0
+    )
+    if not reranking_enabled:
+        logger.warning("No reranking settings found, using unranked documents")
+        reranked_documents = verified_documents
+    elif len(verified_documents) > 1:
+        reranked_documents = rerank_sections(
+            _search_query,
+            verified_documents,
+        )
+    else:  # zero or one verified document: there is nothing to reorder
+        num = "No" if len(verified_documents) == 0 else "One"
+        logger.warning(f"{num} verified document(s) found, skipping reranking")
+        reranked_documents = verified_documents
+
+    if AGENT_RERANKING_STATS:
+        fit_scores = get_fit_scores(verified_documents, reranked_documents)
+    else:
+        fit_scores = RetrievalFitStats(fit_score_lift=0, rerank_effect=0, fit_scores={})
+
+    return DocRerankingUpdate(
+        reranked_documents=[
+            doc for doc in reranked_documents if isinstance(doc, InferenceSection)
+        ][:AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS],
+        sub_question_retrieval_stats=fit_scores,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="shared - expanded retrieval",
+                node_name="rerank documents",
+                node_start_time=node_start_time,
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/retrieve_documents.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/retrieve_documents.py
@@ -0,0 +1,113 @@
+from datetime import datetime
+from typing import cast
+
+from langchain_core.runnables.config import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.operations import (
+    logger,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    DocRetrievalUpdate,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    RetrievalInput,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.calculations import get_fit_scores
+from onyx.agents.agent_search.shared_graph_utils.models import QueryRetrievalResult
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_langgraph_node_log_string,
+)
+from onyx.configs.agent_configs import AGENT_MAX_QUERY_RETRIEVAL_RESULTS
+from onyx.configs.agent_configs import AGENT_RETRIEVAL_STATS
+from onyx.context.search.models import InferenceSection
+from onyx.db.engine import get_session_context_manager
+from onyx.tools.models import SearchQueryInfo
+from onyx.tools.tool_implementations.search.search_tool import (
+    SEARCH_RESPONSE_SUMMARY_ID,
+)
+from onyx.tools.tool_implementations.search.search_tool import SearchResponseSummary
+
+
+def retrieve_documents(
+    state: RetrievalInput, config: RunnableConfig
+) -> DocRetrievalUpdate:
+    """
+    LangGraph node to retrieve documents from the search tool.
+
+    Args:
+        state (RetrievalInput): The current state; only `query_to_retrieve` is read
+        config (RunnableConfig): Configuration containing the GraphConfig under
+            config["metadata"]["config"]
+
+    Returns:
+        DocRetrievalUpdate holding at most AGENT_MAX_QUERY_RETRIEVAL_RESULTS
+        retrieved sections, one QueryRetrievalResult for the query and a timing
+        log message. A blank query yields an empty update without running the
+        search tool.
+
+    Raises:
+        ValueError: if the query is non-blank and no search tool is configured
+    """
+    node_start_time = datetime.now()
+    query_to_retrieve = state.query_to_retrieve
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    search_tool = graph_config.tooling.search_tool
+
+    # Blank queries short-circuit before the search tool is required.
+    if not query_to_retrieve.strip():
+        logger.warning("Empty query, skipping retrieval")
+
+        return DocRetrievalUpdate(
+            query_retrieval_results=[],
+            retrieved_documents=[],
+            log_messages=[
+                get_langgraph_node_log_string(
+                    graph_component="shared - expanded retrieval",
+                    node_name="retrieve documents",
+                    node_start_time=node_start_time,
+                    result="Empty query, skipping retrieval",
+                )
+            ],
+        )
+
+    if search_tool is None:
+        raise ValueError("search_tool must be provided for agentic search")
+
+    retrieved_docs: list[InferenceSection] = []
+    query_info = None
+    # Filled by the search tool through `retrieved_sections_callback`.
+    callback_container: list[list[InferenceSection]] = []
+
+    # new db session to avoid concurrency issues
+    with get_session_context_manager() as db_session:
+        for tool_response in search_tool.run(
+            query=query_to_retrieve,
+            force_no_rerank=True,
+            alternate_db_session=db_session,
+            retrieved_sections_callback=callback_container.append,
+        ):
+            # get retrieved docs to send to the rest of the graph
+            if tool_response.id == SEARCH_RESPONSE_SUMMARY_ID:
+                response = cast(SearchResponseSummary, tool_response.response)
+                retrieved_docs = response.top_sections
+                query_info = SearchQueryInfo(
+                    predicted_search=response.predicted_search,
+                    final_filters=response.final_filters,
+                    recency_bias_multiplier=response.recency_bias_multiplier,
+                )
+                # Only the summary response is needed; stop consuming the tool.
+                break
+
+    retrieved_docs = retrieved_docs[:AGENT_MAX_QUERY_RETRIEVAL_RESULTS]
+
+    fit_scores = None
+    if AGENT_RETRIEVAL_STATS:
+        if callback_container:
+            fit_scores = get_fit_scores(
+                callback_container[0],
+                retrieved_docs,
+            )
+        else:
+            # The callback never fired, so there is no baseline to compare
+            # against; skip the stats instead of failing with an IndexError.
+            logger.warning(
+                "No sections reported by the search tool, skipping retrieval stats"
+            )
+
+    expanded_retrieval_result = QueryRetrievalResult(
+        query=query_to_retrieve,
+        retrieved_documents=retrieved_docs,
+        stats=fit_scores,
+        query_info=query_info,
+    )
+
+    return DocRetrievalUpdate(
+        query_retrieval_results=[expanded_retrieval_result],
+        retrieved_documents=retrieved_docs,
+        log_messages=[
+            get_langgraph_node_log_string(
+                graph_component="shared - expanded retrieval",
+                node_name="retrieve documents",
+                node_start_time=node_start_time,
+            )
+        ],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/verify_documents.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/nodes/verify_documents.py
@@ -0,0 +1,62 @@
+from typing import cast
+
+from langchain_core.messages import HumanMessage
+from langchain_core.runnables.config import RunnableConfig
+
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    DocVerificationInput,
+)
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.states import (
+    DocVerificationUpdate,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
+    trim_prompt_piece,
+)
+from onyx.prompts.agent_search import (
+    DOCUMENT_VERIFICATION_PROMPT,
+)
+
+
+def verify_documents(
+    state: DocVerificationInput, config: RunnableConfig
+) -> DocVerificationUpdate:
+    """
+    LangGraph node that asks the fast LLM whether one retrieved document is
+    relevant for the question.
+
+    Args:
+        state (DocVerificationInput): The current state; `question` and
+            `retrieved_document_to_verify` are read
+        config (RunnableConfig): Configuration containing the GraphConfig under
+            config["metadata"]["config"]
+
+    Updates:
+        verified_documents: list[InferenceSection] - contains the document when
+            the LLM's string response includes "yes" (case-insensitive),
+            otherwise empty
+    """
+
+    question = state.question
+    candidate_document = state.retrieved_document_to_verify
+    raw_content = candidate_document.combined_content
+
+    graph_config = cast(GraphConfig, config["metadata"]["config"])
+    fast_llm = graph_config.tooling.fast_llm
+
+    # Trim the document text against the prompt template plus the question.
+    trimmed_content = trim_prompt_piece(
+        fast_llm.config, raw_content, DOCUMENT_VERIFICATION_PROMPT + question
+    )
+
+    prompt_text = DOCUMENT_VERIFICATION_PROMPT.format(
+        question=question, document_content=trimmed_content
+    )
+    response = fast_llm.invoke([HumanMessage(content=prompt_text)])
+
+    answer = response.content
+    is_relevant = isinstance(answer, str) and "yes" in answer.lower()
+
+    return DocVerificationUpdate(
+        verified_documents=[candidate_document] if is_relevant else [],
+    )
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/operations.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/operations.py
@@ -0,0 +1,93 @@
+from collections import defaultdict
+from collections.abc import Callable
+
+import numpy as np
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.shared_graph_utils.models import AgentChunkRetrievalStats
+from onyx.agents.agent_search.shared_graph_utils.models import QueryRetrievalResult
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import SubQueryPiece
+from onyx.context.search.models import InferenceSection
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def dispatch_subquery(
+    level: int, question_num: int, writer: StreamWriter
+) -> Callable[[str, int], None]:
+    """
+    Build a callback that writes sub-query tokens as "subqueries" custom events.
+
+    The returned callable takes a token and a query id, wraps them in a
+    SubQueryPiece tagged with `level` and `question_num`, and hands the piece
+    to `write_custom_event` together with `writer`.
+    """
+
+    def _emit(token: str, num: int) -> None:
+        piece = SubQueryPiece(
+            sub_query=token,
+            level=level,
+            level_question_num=question_num,
+            query_id=num,
+        )
+        write_custom_event("subqueries", piece, writer)
+
+    return _emit
+
+
+def calculate_sub_question_retrieval_stats(
+    verified_documents: list[InferenceSection],
+    expanded_retrieval_results: list[QueryRetrievalResult],
+) -> AgentChunkRetrievalStats:
+    """
+    Summarize retrieval scores of verified vs. rejected chunks.
+
+    A chunk is identified by "<document_id>_<chunk_id>" of its center chunk.
+    Only chunks that carry at least one non-null score in
+    `expanded_retrieval_results` are counted; each contributes the mean of its
+    scores. A counted chunk is "verified" when its id appears among
+    `verified_documents`, otherwise "rejected".
+
+    Returns:
+        AgentChunkRetrievalStats with the counts and per-chunk average scores of
+        both groups (verified average is 0.0 and rejected average is None when
+        the respective group is empty), the ids of all verified documents and
+        the ids of the rejected (dismissed) chunks.
+    """
+    # Every non-null score observed for a chunk across all query results.
+    chunk_scores: dict[str, list[int | float]] = defaultdict(list)
+
+    for expanded_retrieval_result in expanded_retrieval_results:
+        for doc in expanded_retrieval_result.retrieved_documents:
+            score = doc.center_chunk.score
+            if score is not None:
+                doc_chunk_id = (
+                    f"{doc.center_chunk.document_id}_{doc.center_chunk.chunk_id}"
+                )
+                chunk_scores[doc_chunk_id].append(score)
+
+    verified_doc_chunk_ids = [
+        f"{verified_document.center_chunk.document_id}_{verified_document.center_chunk.chunk_id}"
+        for verified_document in verified_documents
+    ]
+    # Set copy for constant-time membership tests inside the loop below.
+    verified_id_lookup = set(verified_doc_chunk_ids)
+    dismissed_doc_chunk_ids: list[str] = []
+
+    raw_chunk_stats_counts: dict[str, int] = defaultdict(int)
+    raw_chunk_stats_scores: dict[str, float] = defaultdict(float)
+    for doc_chunk_id, scores in chunk_scores.items():
+        key = "verified" if doc_chunk_id in verified_id_lookup else "rejected"
+        raw_chunk_stats_counts[f"{key}_count"] += 1
+
+        # `scores` is never empty: entries are only created when a score exists.
+        raw_chunk_stats_scores[f"{key}_scores"] += float(np.mean(scores))
+
+        if key == "rejected":
+            dismissed_doc_chunk_ids.append(doc_chunk_id)
+
+    if raw_chunk_stats_counts["verified_count"] == 0:
+        verified_avg_scores = 0.0
+    else:
+        verified_avg_scores = raw_chunk_stats_scores["verified_scores"] / float(
+            raw_chunk_stats_counts["verified_count"]
+        )
+
+    # `.get` does not trigger the defaultdict factory, so this is None exactly
+    # when no chunk was rejected (and the count below would be zero).
+    rejected_scores = raw_chunk_stats_scores.get("rejected_scores")
+    if rejected_scores is not None:
+        rejected_avg_scores = rejected_scores / float(
+            raw_chunk_stats_counts["rejected_count"]
+        )
+    else:
+        rejected_avg_scores = None
+
+    chunk_stats = AgentChunkRetrievalStats(
+        verified_count=raw_chunk_stats_counts["verified_count"],
+        verified_avg_scores=verified_avg_scores,
+        rejected_count=raw_chunk_stats_counts["rejected_count"],
+        rejected_avg_scores=rejected_avg_scores,
+        verified_doc_chunk_ids=verified_doc_chunk_ids,
+        dismissed_doc_chunk_ids=dismissed_doc_chunk_ids,
+    )
+
+    return chunk_stats
--- a/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/states.py
+++ b/backend/onyx/agents/agent_search/deep_search/shared/expanded_retrieval/states.py
@@ -0,0 +1,91 @@
+from operator import add
+from typing import Annotated
+
+from pydantic import BaseModel
+
+from onyx.agents.agent_search.core_state import SubgraphCoreState
+from onyx.agents.agent_search.deep_search.main.states import LoggerUpdate
+from onyx.agents.agent_search.deep_search.shared.expanded_retrieval.models import (
+    QuestionRetrievalResult,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import QueryRetrievalResult
+from onyx.agents.agent_search.shared_graph_utils.models import RetrievalFitStats
+from onyx.agents.agent_search.shared_graph_utils.operators import (
+    dedup_inference_sections,
+)
+from onyx.context.search.models import InferenceSection
+
+### States ###
+
+## Graph Input State
+
+
+class ExpandedRetrievalInput(SubgraphCoreState):
+    """Input state of the expanded retrieval graph, on top of SubgraphCoreState."""
+
+    # Question the retrieval is for; verify_documents feeds it to the
+    # document verification prompt.
+    question: str = ""
+    # NOTE(review): not read in the nodes visible here - presumably marks the
+    # base (non-decomposed) search; confirm against the graph builder.
+    base_search: bool = False
+    # NOTE(review): presumably identifies the sub-question being answered,
+    # None when there is none - confirm against callers.
+    sub_question_id: str | None = None
+
+
+## Update/Return States
+
+
+class QueryExpansionUpdate(LoggerUpdate, BaseModel):
+    """State update carrying the expanded queries plus log messages."""
+
+    # NOTE(review): presumably the rewritten/expanded variants of the question
+    # that are each sent to retrieval - the producing node is not in view.
+    expanded_queries: list[str] = []
+    # Re-declared here with an empty default (LoggerUpdate is a base class).
+    log_messages: list[str] = []
+
+
+class DocVerificationUpdate(BaseModel):
+    """State update returned by the verify_documents node."""
+
+    # Documents judged relevant. The Annotated metadata is presumably the
+    # LangGraph reducer merging parallel updates without duplicates - confirm.
+    verified_documents: Annotated[list[InferenceSection], dedup_inference_sections] = []
+
+
+class DocRetrievalUpdate(LoggerUpdate, BaseModel):
+    """State update returned by the retrieve_documents node."""
+
+    # One QueryRetrievalResult per retrieved query; annotated with `add`,
+    # presumably so results of several queries are concatenated - confirm.
+    query_retrieval_results: Annotated[list[QueryRetrievalResult], add] = []
+    # Sections retrieved for the query, annotated with dedup_inference_sections.
+    retrieved_documents: Annotated[
+        list[InferenceSection], dedup_inference_sections
+    ] = []
+
+
+class DocRerankingUpdate(LoggerUpdate, BaseModel):
+    """State update returned by the "rerank documents" node."""
+
+    # Documents after the (optional) reranking step.
+    reranked_documents: Annotated[list[InferenceSection], dedup_inference_sections] = []
+    # Fit statistics computed by the reranking node; None until that node ran.
+    sub_question_retrieval_stats: RetrievalFitStats | None = None
+
+
+class ExpandedRetrievalUpdate(LoggerUpdate, BaseModel):
+    """State update carrying the retrieval result for the question."""
+
+    # Required (no default), unlike the same field on ExpandedRetrievalOutput.
+    expanded_retrieval_result: QuestionRetrievalResult
+
+
+## Graph Output State
+
+
+class ExpandedRetrievalOutput(LoggerUpdate, BaseModel):
+    """Output state of the expanded retrieval graph."""
+
+    # Both results default to an empty QuestionRetrievalResult() that is
+    # instantiated once, at class definition time.
+    expanded_retrieval_result: QuestionRetrievalResult = QuestionRetrievalResult()
+    # NOTE(review): presumably the result of the base search - confirm.
+    base_expanded_retrieval_result: QuestionRetrievalResult = QuestionRetrievalResult()
+    retrieved_documents: Annotated[
+        list[InferenceSection], dedup_inference_sections
+    ] = []
+
+
+## Graph State
+
+
+class ExpandedRetrievalState(
+    # This includes the core state
+    ExpandedRetrievalInput,
+    QueryExpansionUpdate,
+    DocRetrievalUpdate,
+    DocVerificationUpdate,
+    DocRerankingUpdate,
+    ExpandedRetrievalOutput,
+):
+    """
+    Full state of the expanded retrieval graph: the union of the input state,
+    every update state and the output state. Adds no fields of its own.
+    """
+
+    pass
+
+
+## Conditional Input States
+
+
+class DocVerificationInput(ExpandedRetrievalInput):
+    """Input of the verify_documents node: the graph input plus one document."""
+
+    # The single retrieved section whose relevance is checked (required).
+    retrieved_document_to_verify: InferenceSection
+
+
+class RetrievalInput(ExpandedRetrievalInput):
+    """Input of the retrieve_documents node: the graph input plus one query."""
+
+    # Query passed to the search tool; a blank value makes the node skip retrieval.
+    query_to_retrieve: str = ""
--- a/backend/onyx/agents/agent_search/models.py
+++ b/backend/onyx/agents/agent_search/models.py
@@ -0,0 +1,90 @@
+from uuid import UUID
+
+from pydantic import BaseModel
+from pydantic import model_validator
+from sqlalchemy.orm import Session
+
+from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
+from onyx.context.search.models import SearchRequest
+from onyx.file_store.utils import InMemoryChatFile
+from onyx.llm.interfaces import LLM
+from onyx.tools.force import ForceUseTool
+from onyx.tools.tool import Tool
+from onyx.tools.tool_implementations.search.search_tool import SearchTool
+
+
+class GraphInputs(BaseModel):
+    """Input data required for the graph execution"""
+
+    # Search request whose settings (e.g. rerank_settings) the nodes reuse.
+    search_request: SearchRequest
+    # Default prompt builder; llm_tool_choice falls back to it when the state
+    # carries no prompt snapshot.
+    prompt_builder: AnswerPromptBuilder
+    files: list[InMemoryChatFile] | None = None
+    # Forwarded unchanged to llm.stream(...) as `structured_response_format`.
+    structured_response_format: dict | None = None
+
+    class Config:
+        # Needed because fields hold non-pydantic project types.
+        arbitrary_types_allowed = True
+
+
+class GraphTooling(BaseModel):
+    """Tools and LLMs available to the graph"""
+
+    # LLM used for tool choice and for streaming the final answer.
+    primary_llm: LLM
+    # LLM used for cheaper auxiliary calls such as document verification.
+    fast_llm: LLM
+    # Optional here, but GraphConfig rejects None when agentic search is on.
+    search_tool: SearchTool | None = None
+    # All tools known to the graph; nodes filter this list by tool name.
+    tools: list[Tool]
+    # Whether to force use of a tool, or to
+    # force tool args IF the tool is used
+    force_use_tool: ForceUseTool
+    # False routes tool selection through the non-tool-calling LLM helper.
+    using_tool_calling_llm: bool = False
+
+    class Config:
+        # Needed because fields hold non-pydantic project types.
+        arbitrary_types_allowed = True
+
+
+class GraphPersistence(BaseModel):
+    """Configuration for data persistence"""
+
+    # Chat session the graph run belongs to.
+    chat_session_id: UUID
+    # The message ID of the to-be-created first agent message
+    # in response to the user message that triggered the Pro Search
+    message_id: int
+
+    # The database session the user and initial agent
+    # message were flushed to; only needed for agentic search
+    db_session: Session
+
+    class Config:
+        # Needed because `Session` is not a pydantic type.
+        arbitrary_types_allowed = True
+
+
+class GraphSearchConfig(BaseModel):
+    """Configuration controlling search behavior"""
+
+    # When True, GraphConfig requires a search tool to be configured.
+    use_agentic_search: bool = False
+    # Whether to perform initial search to inform decomposition
+    perform_initial_search_decomposition: bool = True
+
+    # Whether to allow creation of refinement questions (and entity extraction, etc.)
+    allow_refinement: bool = True
+    # When True, nodes skip streaming an LLM-generated answer.
+    skip_gen_ai_answer_generation: bool = False
+
+
+class GraphConfig(BaseModel):
+    """
+    Main container for data needed for Langgraph execution
+
+    Nodes read it from config["metadata"]["config"].
+    """
+
+    inputs: GraphInputs
+    tooling: GraphTooling
+    behavior: GraphSearchConfig
+    # Only needed for agentic search
+    # NOTE: the field is nevertheless required (it has no default).
+    persistence: GraphPersistence
+
+    @model_validator(mode="after")
+    def validate_search_tool(self) -> "GraphConfig":
+        # Agentic search cannot run without a search tool; fail at
+        # construction time instead of inside a graph node.
+        if self.behavior.use_agentic_search and self.tooling.search_tool is None:
+            raise ValueError("search_tool must be provided for agentic search")
+        return self
+
+    class Config:
+        arbitrary_types_allowed = True
--- a/backend/onyx/agents/agent_search/orchestration/nodes/basic_use_tool_response.py
+++ b/backend/onyx/agents/agent_search/orchestration/nodes/basic_use_tool_response.py
@@ -0,0 +1,76 @@
+from typing import cast
+
+from langchain_core.messages import AIMessageChunk
+from langchain_core.runnables.config import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.basic.states import BasicOutput
+from onyx.agents.agent_search.basic.states import BasicState
+from onyx.agents.agent_search.basic.utils import process_llm_stream
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.chat.models import LlmDoc
+from onyx.tools.tool_implementations.search.search_tool import (
+    SEARCH_DOC_CONTENT_ID,
+)
+from onyx.tools.tool_implementations.search_like_tool_utils import (
+    FINAL_CONTEXT_DOCUMENTS_ID,
+)
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def basic_use_tool_response(
+    state: BasicState, config: RunnableConfig, writer: StreamWriter = lambda _: None
+) -> BasicOutput:
+    """
+    LangGraph node that turns the output of a finished tool call into the
+    final LLM response.
+
+    The chosen tool builds the follow-up prompt from the tool call summary and
+    responses. Unless answer generation is skipped, the primary LLM is streamed
+    on that prompt and the stream is processed via `process_llm_stream`.
+
+    Args:
+        state (BasicState): must carry `tool_choice` and `tool_call_output`
+        config (RunnableConfig): Configuration containing the GraphConfig under
+            config["metadata"]["config"]
+        writer (StreamWriter): stream writer handed to `process_llm_stream`;
+            defaults to a no-op
+
+    Returns:
+        BasicOutput with the processed message chunk, or an empty
+        AIMessageChunk when answer generation is skipped.
+
+    Raises:
+        ValueError: if the state has no tool choice or no tool call output
+    """
+    agent_config = cast(GraphConfig, config["metadata"]["config"])
+    structured_response_format = agent_config.inputs.structured_response_format
+    llm = agent_config.tooling.primary_llm
+    tool_choice = state.tool_choice
+    if tool_choice is None:
+        raise ValueError("Tool choice is None")
+    tool = tool_choice.tool
+    prompt_builder = agent_config.inputs.prompt_builder
+    if state.tool_call_output is None:
+        raise ValueError("Tool call output is None")
+    tool_call_output = state.tool_call_output
+    tool_call_summary = tool_call_output.tool_call_summary
+    tool_call_responses = tool_call_output.tool_call_responses
+
+    new_prompt_builder = tool.build_next_prompt(
+        prompt_builder=prompt_builder,
+        tool_call_summary=tool_call_summary,
+        tool_responses=tool_call_responses,
+        using_tool_calling_llm=agent_config.tooling.using_tool_calling_llm,
+    )
+
+    final_search_results = []
+    initial_search_results = []
+    # Ids already collected. The ids must be tracked separately: testing an id
+    # for membership in the list of documents never matches, so duplicates
+    # would slip through.
+    seen_document_ids = set()
+    for yield_item in tool_call_responses:
+        if yield_item.id == FINAL_CONTEXT_DOCUMENTS_ID:
+            final_search_results = cast(list[LlmDoc], yield_item.response)
+        elif yield_item.id == SEARCH_DOC_CONTENT_ID:
+            for doc in yield_item.response.contexts:
+                if doc.document_id in seen_document_ids:
+                    continue
+                seen_document_ids.add(doc.document_id)
+                initial_search_results.append(doc)
+
+    displayed_search_results = cast(list[LlmDoc], initial_search_results)
+
+    new_tool_call_chunk = AIMessageChunk(content="")
+    if not agent_config.behavior.skip_gen_ai_answer_generation:
+        stream = llm.stream(
+            prompt=new_prompt_builder.build(),
+            structured_response_format=structured_response_format,
+        )
+
+        # For now, we don't do multiple tool calls, so we ignore the tool_message
+        new_tool_call_chunk = process_llm_stream(
+            stream,
+            True,
+            writer,
+            final_search_results=final_search_results,
+            displayed_search_results=displayed_search_results,
+        )
+
+    return BasicOutput(tool_call_chunk=new_tool_call_chunk)
--- a/backend/onyx/agents/agent_search/orchestration/nodes/llm_tool_choice.py
+++ b/backend/onyx/agents/agent_search/orchestration/nodes/llm_tool_choice.py
@@ -0,0 +1,152 @@
+from typing import cast
+from uuid import uuid4
+
+from langchain_core.messages import ToolCall
+from langchain_core.runnables.config import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.basic.utils import process_llm_stream
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.orchestration.states import ToolChoice
+from onyx.agents.agent_search.orchestration.states import ToolChoiceState
+from onyx.agents.agent_search.orchestration.states import ToolChoiceUpdate
+from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
+from onyx.chat.tool_handling.tool_response_handler import get_tool_by_name
+from onyx.chat.tool_handling.tool_response_handler import (
+    get_tool_call_for_non_tool_calling_llm_impl,
+)
+from onyx.tools.tool import Tool
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+# TODO: break this out into an implementation function
+# and a function that handles extracting the necessary fields
+# from the state and config
+# TODO: fan-out to multiple tool call nodes? Make this configurable?
+def llm_tool_choice(
+    state: ToolChoiceState, config: RunnableConfig, writer: StreamWriter
+) -> ToolChoiceUpdate:
+    """
+    This node is responsible for calling the LLM to choose a tool. If no tool is chosen,
+    The node MAY emit an answer, depending on whether state.should_stream_answer is set.
+
+    Args:
+        state (ToolChoiceState): provides `should_stream_answer`, the optional
+            `prompt_snapshot` and the names of the tools to consider
+        config (RunnableConfig): Configuration containing the GraphConfig under
+            config["metadata"]["config"]
+        writer (StreamWriter): stream writer handed to `process_llm_stream`
+
+    Returns:
+        ToolChoiceUpdate with the chosen tool and its arguments, or with
+        `tool_choice=None` when no tool is to be called.
+
+    Raises:
+        ValueError: if the LLM emitted tool calls but none names a known tool
+    """
+    should_stream_answer = state.should_stream_answer
+
+    agent_config = cast(GraphConfig, config["metadata"]["config"])
+    using_tool_calling_llm = agent_config.tooling.using_tool_calling_llm
+    prompt_builder = state.prompt_snapshot or agent_config.inputs.prompt_builder
+
+    llm = agent_config.tooling.primary_llm
+    skip_gen_ai_answer_generation = agent_config.behavior.skip_gen_ai_answer_generation
+
+    structured_response_format = agent_config.inputs.structured_response_format
+    # Restrict the configured tools to the names requested in the state.
+    tools = [
+        candidate
+        for candidate in (agent_config.tooling.tools or [])
+        if candidate.name in state.tools
+    ]
+    force_use_tool = agent_config.tooling.force_use_tool
+
+    tool, tool_args = None, None
+    if force_use_tool.force_use and force_use_tool.args is not None:
+        tool_name, tool_args = (
+            force_use_tool.tool_name,
+            force_use_tool.args,
+        )
+        tool = get_tool_by_name(tools, tool_name)
+
+    # special pre-logic for non-tool calling LLM case
+    elif not using_tool_calling_llm and tools:
+        chosen_tool_and_args = get_tool_call_for_non_tool_calling_llm_impl(
+            force_use_tool=force_use_tool,
+            tools=tools,
+            prompt_builder=prompt_builder,
+            llm=llm,
+        )
+        if chosen_tool_and_args:
+            tool, tool_args = chosen_tool_and_args
+
+    # If we have a tool and tool args, we are ready to request a tool call.
+    # This only happens if the tool call was forced or we are using a non-tool calling LLM.
+    # NOTE(review): an empty args dict is falsy, so such a choice falls through
+    # to the LLM call below - confirm this is intended.
+    if tool and tool_args:
+        return ToolChoiceUpdate(
+            tool_choice=ToolChoice(
+                tool=tool,
+                tool_args=tool_args,
+                id=str(uuid4()),
+            ),
+        )
+
+    # if we're skipping gen ai answer generation, we should only
+    # continue if we're forcing a tool call (which will be emitted by
+    # the tool calling llm in the stream() below)
+    if skip_gen_ai_answer_generation and not force_use_tool.force_use:
+        return ToolChoiceUpdate(
+            tool_choice=None,
+        )
+
+    built_prompt = (
+        prompt_builder.build()
+        if isinstance(prompt_builder, AnswerPromptBuilder)
+        else prompt_builder.built_prompt
+    )
+    # At this point, we are either using a tool calling LLM or we are skipping the tool call.
+    # DEBUG: good breakpoint
+    stream = llm.stream(
+        # For tool calling LLMs, we want to insert the task prompt as part of this flow, this is because the LLM
+        # may choose to not call any tools and just generate the answer, in which case the task prompt is needed.
+        prompt=built_prompt,
+        tools=[candidate.tool_definition() for candidate in tools] or None,
+        tool_choice=("required" if tools and force_use_tool.force_use else None),
+        structured_response_format=structured_response_format,
+    )
+
+    tool_message = process_llm_stream(
+        stream,
+        should_stream_answer and not skip_gen_ai_answer_generation,
+        writer,
+    )
+
+    # If no tool calls are emitted by the LLM, we should not choose a tool
+    if len(tool_message.tool_calls) == 0:
+        logger.debug("No tool calls emitted by LLM")
+        return ToolChoiceUpdate(
+            tool_choice=None,
+        )
+
+    # TODO: here we could handle parallel tool calls. Right now
+    # we just pick the first one that matches.
+    selected_tool: Tool | None = None
+    selected_tool_call_request: ToolCall | None = None
+    for tool_call_request in tool_message.tool_calls:
+        known_tools_by_name = [
+            candidate
+            for candidate in tools
+            if candidate.name == tool_call_request["name"]
+        ]
+
+        if known_tools_by_name:
+            selected_tool = known_tools_by_name[0]
+            selected_tool_call_request = tool_call_request
+            break
+
+        # Each detail goes on its own line so the two values do not run together.
+        logger.error(
+            "Tool call requested with unknown name field. \n"
+            f"tools: {tools}\n"
+            f"tool_call_request: {tool_call_request}"
+        )
+
+    if not selected_tool or not selected_tool_call_request:
+        raise ValueError(
+            f"Tool call attempted with tool {selected_tool}, request {selected_tool_call_request}"
+        )
+
+    logger.debug(f"Selected tool: {selected_tool.name}")
+    logger.debug(f"Selected tool call request: {selected_tool_call_request}")
+
+    return ToolChoiceUpdate(
+        tool_choice=ToolChoice(
+            tool=selected_tool,
+            tool_args=selected_tool_call_request["args"],
+            id=selected_tool_call_request["id"],
+        ),
+    )
--- a/backend/onyx/agents/agent_search/orchestration/nodes/prepare_tool_input.py
+++ b/backend/onyx/agents/agent_search/orchestration/nodes/prepare_tool_input.py
@@ -0,0 +1,17 @@
+from typing import Any
+from typing import cast
+
+from langchain_core.runnables.config import RunnableConfig
+
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.orchestration.states import ToolChoiceInput
+
+
+def prepare_tool_input(state: Any, config: RunnableConfig) -> ToolChoiceInput:
+    """
+    Build the ToolChoiceInput for the top-level tool choice.
+
+    `state` is not read. Every tool configured in the GraphConfig is offered
+    by name, and the default prompt builder is used (no prompt snapshot).
+    """
+    agent_config = cast(GraphConfig, config["metadata"]["config"])
+    configured_tools = agent_config.tooling.tools or []
+    tool_names = [configured_tool.name for configured_tool in configured_tools]
+
+    # NOTE: this node is used at the top level of the agent, so we always stream
+    return ToolChoiceInput(
+        should_stream_answer=True,
+        prompt_snapshot=None,  # uses default prompt builder
+        tools=tool_names,
+    )
--- a/backend/onyx/agents/agent_search/orchestration/nodes/tool_call.py
+++ b/backend/onyx/agents/agent_search/orchestration/nodes/tool_call.py
@@ -0,0 +1,70 @@
+from typing import cast
+
+from langchain_core.messages import AIMessageChunk
+from langchain_core.messages.tool import ToolCall
+from langchain_core.runnables.config import RunnableConfig
+from langgraph.types import StreamWriter
+
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.orchestration.states import ToolCallOutput
+from onyx.agents.agent_search.orchestration.states import ToolCallUpdate
+from onyx.agents.agent_search.orchestration.states import ToolChoiceUpdate
+from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
+from onyx.chat.models import AnswerPacket
+from onyx.tools.message import build_tool_message
+from onyx.tools.message import ToolCallSummary
+from onyx.tools.tool_runner import ToolRunner
+from onyx.utils.logger import setup_logger
+
+
+logger = setup_logger()
+
+
+def emit_packet(packet: AnswerPacket, writer: StreamWriter) -> None:
+    """Write `packet` to `writer` as a "basic_response" custom event."""
+    event_name = "basic_response"
+    write_custom_event(event_name, packet, writer)
+
+
+def tool_call(
+    state: ToolChoiceUpdate,
+    config: RunnableConfig,
+    writer: StreamWriter,
+) -> ToolCallUpdate:
+    """
+    Run the tool selected in the state and return its collected output.
+
+    The kickoff, every tool response and the final result are each emitted
+    through `emit_packet` as soon as they are available, and are also gathered
+    into the returned ToolCallOutput together with a ToolCallSummary.
+
+    Raises:
+        ValueError: if the state carries no tool choice
+    """
+
+    # The config value itself is unused; the lookup is kept so that a missing
+    # "metadata"/"config" entry still fails here.
+    cast(GraphConfig, config["metadata"]["config"])
+
+    chosen = state.tool_choice
+    if chosen is None:
+        raise ValueError("Cannot invoke tool call node without a tool choice")
+
+    tool, tool_args, tool_id = chosen.tool, chosen.tool_args, chosen.id
+    runner = ToolRunner(tool, tool_args)
+
+    kickoff = runner.kickoff()
+    emit_packet(kickoff, writer)
+
+    # Responses are forwarded one by one while being collected.
+    collected_responses = []
+    for tool_response in runner.tool_responses():
+        collected_responses.append(tool_response)
+        emit_packet(tool_response, writer)
+
+    final_result = runner.tool_final_result()
+    emit_packet(final_result, writer)
+
+    call_request = ToolCall(name=tool.name, args=tool_args, id=tool_id)
+    request_message = AIMessageChunk(content="", tool_calls=[call_request])
+    result_message = build_tool_message(call_request, runner.tool_message_content())
+
+    return ToolCallUpdate(
+        tool_call_output=ToolCallOutput(
+            tool_call_summary=ToolCallSummary(
+                tool_call_request=request_message,
+                tool_call_result=result_message,
+            ),
+            tool_call_kickoff=kickoff,
+            tool_call_responses=collected_responses,
+            tool_call_final_result=final_result,
+        )
+    )
--- a/backend/onyx/agents/agent_search/orchestration/states.py
+++ b/backend/onyx/agents/agent_search/orchestration/states.py
@@ -0,0 +1,48 @@
+from pydantic import BaseModel
+
+from onyx.chat.prompt_builder.answer_prompt_builder import PromptSnapshot
+from onyx.tools.message import ToolCallSummary
+from onyx.tools.models import ToolCallFinalResult
+from onyx.tools.models import ToolCallKickoff
+from onyx.tools.models import ToolResponse
+from onyx.tools.tool import Tool
+
+
+# TODO: adapt the tool choice/tool call to allow for parallel tool calls by
+# creating a subgraph that can be invoked in parallel via Send/Command APIs
+class ToolChoiceInput(BaseModel):
+    """Input of the tool choice node (built by prepare_tool_input)."""
+
+    # Whether llm_tool_choice may stream an answer when no tool is chosen.
+    should_stream_answer: bool = True
+    # default to the prompt builder from the config, but
+    # allow overrides for arbitrary tool calls
+    prompt_snapshot: PromptSnapshot | None = None
+
+    # names of tools to use for tool calling. Filters the tools available in the config
+    tools: list[str] = []
+
+
+class ToolCallOutput(BaseModel):
+    """Everything the tool_call node collected while running one tool."""
+
+    # Request/result message pair describing the call.
+    tool_call_summary: ToolCallSummary
+    # Kickoff packet returned by the tool runner before any response.
+    tool_call_kickoff: ToolCallKickoff
+    # Responses in the order the tool runner produced them.
+    tool_call_responses: list[ToolResponse]
+    tool_call_final_result: ToolCallFinalResult
+
+
+class ToolCallUpdate(BaseModel):
+    """State update returned by the tool_call node."""
+
+    # None until a tool call has been executed.
+    tool_call_output: ToolCallOutput | None = None
+
+
+class ToolChoice(BaseModel):
+    """A selected tool together with the arguments to call it with."""
+
+    tool: Tool
+    tool_args: dict
+    # Tool call id: a fresh uuid4 string when llm_tool_choice picks the tool
+    # itself, otherwise the "id" of the LLM's tool call request.
+    id: str | None
+
+    class Config:
+        # Needed because `Tool` is not a pydantic type.
+        arbitrary_types_allowed = True
+
+
+class ToolChoiceUpdate(BaseModel):
+    """State update returned by llm_tool_choice; also the input of tool_call."""
+
+    # None means no tool is to be called.
+    tool_choice: ToolChoice | None = None
+
+
+class ToolChoiceState(ToolChoiceUpdate, ToolChoiceInput):
+    """State seen by llm_tool_choice: the tool choice input plus its update."""
+
+    pass
--- a/backend/onyx/agents/agent_search/run_graph.py
+++ b/backend/onyx/agents/agent_search/run_graph.py
@@ -0,0 +1,213 @@
+from collections.abc import Iterable
+from datetime import datetime
+from typing import cast
+
+from langchain_core.runnables.schema import CustomStreamEvent
+from langchain_core.runnables.schema import StreamEvent
+from langgraph.graph.state import CompiledStateGraph
+
+from onyx.agents.agent_search.basic.graph_builder import basic_graph_builder
+from onyx.agents.agent_search.basic.states import BasicInput
+from onyx.agents.agent_search.deep_search.main.graph_builder import (
+    main_graph_builder as main_graph_builder_a,
+)
+from onyx.agents.agent_search.deep_search.main.states import (
+    MainInput as MainInput_a,
+)
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.utils import get_test_config
+from onyx.chat.models import AgentAnswerPiece
+from onyx.chat.models import AnswerPacket
+from onyx.chat.models import AnswerStream
+from onyx.chat.models import ExtendedToolResponse
+from onyx.chat.models import RefinedAnswerImprovement
+from onyx.chat.models import StreamStopInfo
+from onyx.chat.models import SubQueryPiece
+from onyx.chat.models import SubQuestionPiece
+from onyx.chat.models import ToolResponse
+from onyx.configs.agent_configs import ALLOW_REFINEMENT
+from onyx.configs.agent_configs import INITIAL_SEARCH_DECOMPOSITION_ENABLED
+from onyx.context.search.models import SearchRequest
+from onyx.db.engine import get_session_context_manager
+from onyx.llm.factory import get_default_llms
+from onyx.tools.tool_runner import ToolCallKickoff
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+_COMPILED_GRAPH: CompiledStateGraph | None = None
+
+
+def _parse_agent_event(
+    event: StreamEvent,
+) -> AnswerPacket | None:
+    """
+    Return event["data"] for a known custom event; the casts below are hints for
+    the type checker only. Return None for any other event type or event name.
+    """
+
+    event_type = event["event"]
+
+    # We always just yield the event data, but this piece is useful for two development reasons:
+    # 1. It's a list of the names of every place we dispatch a custom event
+    # 2. We maintain the intended types yielded by each event
+    if event_type == "on_custom_event":
+        if event["name"] == "decomp_qs":
+            return cast(SubQuestionPiece, event["data"])
+        elif event["name"] == "subqueries":
+            return cast(SubQueryPiece, event["data"])
+        elif event["name"] == "sub_answers":
+            return cast(AgentAnswerPiece, event["data"])
+        elif event["name"] == "stream_finished":
+            return cast(StreamStopInfo, event["data"])
+        elif event["name"] == "initial_agent_answer":
+            return cast(AgentAnswerPiece, event["data"])
+        elif event["name"] == "refined_agent_answer":
+            return cast(AgentAnswerPiece, event["data"])
+        elif event["name"] == "start_refined_answer_creation":
+            return cast(ToolCallKickoff, event["data"])
+        elif event["name"] == "tool_response":
+            return cast(ToolResponse, event["data"])
+        elif event["name"] == "basic_response":
+            return cast(AnswerPacket, event["data"])
+        elif event["name"] == "refined_answer_improvement":
+            return cast(RefinedAnswerImprovement, event["data"])
+    return None
+
+
+def manage_sync_streaming(
+    compiled_graph: CompiledStateGraph,
+    config: GraphConfig,
+    graph_input: BasicInput | MainInput_a,
+) -> Iterable[StreamEvent]:
+    message_id = config.persistence.message_id if config.persistence else None
+    for event in compiled_graph.stream(
+        stream_mode="custom",  # yield only what nodes emit via their StreamWriter
+        input=graph_input,
+        config={"metadata": {"config": config, "thread_id": str(message_id)}},
+    ):  # NOTE: thread_id is the literal "None" when config.persistence is unset
+        yield cast(CustomStreamEvent, event)
+
+
+def run_graph(
+    compiled_graph: CompiledStateGraph,
+    config: GraphConfig,
+    input: BasicInput | MainInput_a,
+) -> AnswerStream:
+    # The env-level settings always overwrite the behavior flags on the config.
+    behavior = config.behavior
+    behavior.allow_refinement = ALLOW_REFINEMENT
+    behavior.perform_initial_search_decomposition = (
+        INITIAL_SEARCH_DECOMPOSITION_ENABLED
+    )
+    events = manage_sync_streaming(
+        compiled_graph=compiled_graph, config=config, graph_input=input
+    )
+    for event in events:
+        if packet := _parse_agent_event(event):
+            yield packet
+
+
+# Compiling the graph is cheap, but the result is still cached in a module
+# global so that it is not rebuilt on every request.
+def load_compiled_graph() -> CompiledStateGraph:
+    global _COMPILED_GRAPH
+    if _COMPILED_GRAPH is not None:
+        return _COMPILED_GRAPH
+    _COMPILED_GRAPH = main_graph_builder_a().compile()
+    return _COMPILED_GRAPH
+
+
+def run_main_graph(
+    config: GraphConfig,
+) -> AnswerStream:
+    """
+    Stream the deep-search (main) graph for the query stored in config.
+    A ToolCallKickoff packet is yielded before any graph output.
+    """
+    compiled_graph = load_compiled_graph()
+    query = config.inputs.search_request.query
+    graph_input = MainInput_a(base_question=query, log_messages=[])
+
+    # Agent search is not a Tool per se, but this is helpful for the frontend
+    kickoff = ToolCallKickoff(tool_name="agent_search_0", tool_args={"query": query})
+    yield kickoff
+    yield from run_graph(compiled_graph, config, graph_input)
+
+
+def run_basic_graph(
+    config: GraphConfig,
+) -> AnswerStream:
+    # The basic graph is built and compiled anew on every call; only the main
+    # graph is cached in a module global.
+    compiled_graph = basic_graph_builder().compile()
+    return run_graph(compiled_graph, config, BasicInput())
+
+
+if __name__ == "__main__":  # manual timing/debug run of the main graph
+    for _ in range(1):  # single iteration; raise the count to repeat the run
+        query_start_time = datetime.now()
+        logger.debug(f"Start at {query_start_time}")
+        graph = main_graph_builder_a()
+        compiled_graph = graph.compile()
+        query_end_time = datetime.now()  # NOTE: the value logged below is a timedelta
+        logger.debug(f"Graph compiled in {query_end_time - query_start_time} seconds")
+        primary_llm, fast_llm = get_default_llms()
+        search_request = SearchRequest(
+            # query="what can you do with gitlab?",
+            # query="What are the guiding principles behind the development of cockroachDB",
+            # query="What are the temperatures in Munich, Hawaii, and New York?",
+            # query="When was Washington born?",
+            # query="What is Onyx?",
+            # query="What is the difference between astronomy and astrology?",
+            query="Do a search to tell me what is the difference between astronomy and astrology?",
+        )
+
+        with get_session_context_manager() as db_session:
+            config = get_test_config(db_session, primary_llm, fast_llm, search_request)
+            assert (
+                config.persistence is not None
+            ), "set a chat session id to run this test"
+
+            # search_request.persona = get_persona_by_id(1, None, db_session)
+            # config.perform_initial_search_path_decision = False
+            config.behavior.perform_initial_search_decomposition = True
+            input = MainInput_a(
+                base_question=config.inputs.search_request.query, log_messages=[]
+            )
+
+            tool_responses: list = []  # filled below but not read again in this script
+            for output in run_graph(compiled_graph, config, input):
+                if isinstance(output, ToolCallKickoff):
+                    pass
+                elif isinstance(output, ExtendedToolResponse):
+                    tool_responses.append(output.response)
+                    logger.info(
+                        f"   ---- ET {output.level} - {output.level_question_num} |  "
+                    )
+                elif isinstance(output, SubQueryPiece):
+                    logger.info(
+                        f"Sq {output.level} - {output.level_question_num} - {output.sub_query} | "
+                    )
+                elif isinstance(output, SubQuestionPiece):
+                    logger.info(
+                        f"SQ {output.level} - {output.level_question_num} - {output.sub_question} | "
+                    )
+                elif (
+                    isinstance(output, AgentAnswerPiece)
+                    and output.answer_type == "agent_sub_answer"
+                ):
+                    logger.info(
+                        f"   ---- SA {output.level} - {output.level_question_num} {output.answer_piece} | "
+                    )
+                elif (
+                    isinstance(output, AgentAnswerPiece)
+                    and output.answer_type == "agent_level_answer"
+                ):
+                    logger.info(
+                        f"   ---------- FA {output.level} - {output.level_question_num}  {output.answer_piece} | "
+                    )
+                elif isinstance(output, RefinedAnswerImprovement):
+                    logger.info(
+                        f"   ---------- RE {output.refined_answer_improvement} | "
+                    )
--- a/backend/onyx/agents/agent_search/shared_graph_utils/agent_prompt_ops.py
+++ b/backend/onyx/agents/agent_search/shared_graph_utils/agent_prompt_ops.py
@@ -0,0 +1,152 @@
+from langchain.schema import AIMessage
+from langchain.schema import HumanMessage
+from langchain.schema import SystemMessage
+from langchain_core.messages.tool import ToolMessage
+
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    AgentPromptEnrichmentComponents,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import (
+    get_persona_agent_prompt_expressions,
+)
+from onyx.agents.agent_search.shared_graph_utils.utils import remove_document_citations
+from onyx.agents.agent_search.shared_graph_utils.utils import summarize_history
+from onyx.configs.agent_configs import AGENT_MAX_STATIC_HISTORY_WORD_LENGTH
+from onyx.configs.constants import MessageType
+from onyx.context.search.models import InferenceSection
+from onyx.llm.interfaces import LLMConfig
+from onyx.llm.utils import get_max_input_tokens
+from onyx.natural_language_processing.utils import get_tokenizer
+from onyx.natural_language_processing.utils import tokenizer_trim_content
+from onyx.prompts.agent_search import HISTORY_FRAMING_PROMPT
+from onyx.prompts.agent_search import SUB_QUESTION_RAG_PROMPT
+from onyx.prompts.prompt_utils import build_date_time_string
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def build_sub_question_answer_prompt(
+    question: str,
+    original_question: str,
+    docs: list[InferenceSection],
+    persona_specification: str,
+    config: LLMConfig,
+) -> list[SystemMessage | HumanMessage | AIMessage | ToolMessage]:
+    """Build the [system, human] message pair used to answer one sub-question.
+
+    The system message carries the persona specification; the human message is
+    SUB_QUESTION_RAG_PROMPT filled with both questions, the date string and the
+    numbered document contents (passed through trim_prompt_piece first).
+    """
+    date_str = build_date_time_string()
+
+    # TODO: This should include document metadata and title
+    numbered_docs = []
+    for doc_num, doc in enumerate(docs, start=1):
+        numbered_docs.append(
+            f"Document Number: [D{doc_num}]\nContent: {doc.combined_content}\n\n"
+        )
+
+    # Everything except the documents counts as reserved prompt space.
+    reserved = SUB_QUESTION_RAG_PROMPT + question + original_question + date_str
+    context = trim_prompt_piece(config, "\n\n".join(numbered_docs), reserved)
+
+    prompt_text = SUB_QUESTION_RAG_PROMPT.format(
+        question=question,
+        original_question=original_question,
+        context=context,
+        date_prompt=date_str,
+    )
+    return [
+        SystemMessage(content=persona_specification),
+        HumanMessage(content=prompt_text),
+    ]
+
+
+def trim_prompt_piece(config: LLMConfig, prompt_piece: str, reserved_str: str) -> str:
+    """Trim prompt_piece to the token budget left after reserving reserved_str.
+
+    The piece is returned unchanged when it clearly fits. The budget never goes
+    below zero, even if reserved_str alone exceeds the model's input limit.
+    """
+    # TODO: save the max input tokens in LLMConfig
+    max_tokens = get_max_input_tokens(
+        model_provider=config.model_provider,
+        model_name=config.model_name,
+    )
+    # no need to trim if a conservative estimate of one token
+    # per character is already less than the max tokens
+    if len(prompt_piece) + len(reserved_str) < max_tokens:
+        return prompt_piece
+    llm_tokenizer = get_tokenizer(
+        provider_type=config.model_provider,
+        model_name=config.model_name,
+    )
+    remaining = max(0, max_tokens - len(llm_tokenizer.encode(reserved_str)))
+    return tokenizer_trim_content(
+        content=prompt_piece, desired_length=remaining, tokenizer=llm_tokenizer
+    )
+
+
+def build_history_prompt(config: GraphConfig, question: str) -> str:
+    """Return the chat history framed for the prompt, or an empty string.
+
+    Histories above AGENT_MAX_STATIC_HISTORY_WORD_LENGTH words are summarized.
+    """
+    prompt_builder = config.inputs.prompt_builder
+    if prompt_builder is None:
+        return ""
+
+    if prompt_builder.single_message_history is not None:
+        history = prompt_builder.single_message_history
+    else:
+        history_components = []
+        previous_message_type = None
+        for message in prompt_builder.raw_message_history:
+            if message.message_type == MessageType.USER:
+                history_components.append(f"User: {message.message}\n")
+                previous_message_type = MessageType.USER
+            elif message.message_type == MessageType.ASSISTANT:
+                # Consecutive assistant messages are handled at history construction
+                assert previous_message_type is not MessageType.ASSISTANT
+                history_components.append(f"You/Agent: {message.message}\n")
+                previous_message_type = MessageType.ASSISTANT
+            else:
+                # Other message types are not included here, currently there should be no other message types
+                logger.error(
+                    f"Unhandled message type: {message.message_type} with message: {message.message}"
+                )
+        history = remove_document_citations("\n".join(history_components))
+        if len(history.split()) > AGENT_MAX_STATIC_HISTORY_WORD_LENGTH:
+            # Resolve the persona prompt only here, where it is actually needed
+            persona_prompts = get_persona_agent_prompt_expressions(
+                config.inputs.search_request.persona
+            )
+            history = summarize_history(
+                history=history,
+                question=question,
+                persona_specification=persona_prompts.base_prompt,
+                llm=config.tooling.fast_llm,
+            )
+
+    return HISTORY_FRAMING_PROMPT.format(history=history) if history else ""
+
+
+def get_prompt_enrichment_components(
+    config: GraphConfig,
+) -> AgentPromptEnrichmentComponents:
+    """
+    Gather the prompt enrichment pieces for the request in config: the
+    persona prompt expressions, the framed chat history (built for the
+    request's query) and the string from build_date_time_string().
+    """
+    search_request = config.inputs.search_request
+
+    # Keyword arguments are evaluated in order: persona, history, date.
+    return AgentPromptEnrichmentComponents(
+        persona_prompts=get_persona_agent_prompt_expressions(search_request.persona),
+        history=build_history_prompt(config, search_request.query),
+        date_str=build_date_time_string(),
+    )
--- a/backend/onyx/agents/agent_search/shared_graph_utils/calculations.py
+++ b/backend/onyx/agents/agent_search/shared_graph_utils/calculations.py
@@ -0,0 +1,98 @@
+import numpy as np
+
+from onyx.agents.agent_search.shared_graph_utils.models import RetrievalFitScoreMetrics
+from onyx.agents.agent_search.shared_graph_utils.models import RetrievalFitStats
+from onyx.chat.models import SectionRelevancePiece
+from onyx.context.search.models import InferenceSection
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def unique_chunk_id(doc: InferenceSection) -> str:
+    return "{}_{}".format(doc.center_chunk.document_id, doc.center_chunk.chunk_id)
+
+
+def calculate_rank_shift(list1: list, list2: list, top_n: int = 20) -> float:
+    """Log-damped rank shift of list1[:top_n] within list2, divided by top_n."""
+    if not list1 or not list2 or top_n < 1:
+        return 0.0  # nothing to compare; the formula would divide by zero
+    shift = 0.0
+    for rank_first, doc_id in enumerate(list1[:top_n], 1):
+        # Documents missing from the second list are ranked len(list2)
+        rank_second = list2.index(doc_id) + 1 if doc_id in list2 else len(list2)
+        shift += abs(rank_first - rank_second) / np.log(1 + rank_first * rank_second)
+
+    return float(shift / top_n)
+
+
+def get_fit_scores(
+    pre_reranked_results: list[InferenceSection],
+    post_reranked_results: list[InferenceSection] | list[SectionRelevancePiece],
+) -> RetrievalFitStats | None:
+    """
+    Calculate retrieval metrics for search purposes
+
+    For the initial and the reranked list, the center-chunk scores of the first
+    1, 5 and 10 entries are summed, divided by 1, 5 and 10 and stored under the
+    keys "1", "5" and "10"; "fit_score" is the "1" value. Entries that are not
+    InferenceSection objects and chunks without a score are left out.
+
+    Returns None if either input list is empty. In the result, fit_score_lift is
+    the reranked fit score divided by the initial one (it stays 0 when the
+    initial fit score is 0) and rerank_effect is the calculate_rank_shift value
+    for the initial vs. reranked chunk ids.
+    """
+
+    if len(pre_reranked_results) == 0 or len(post_reranked_results) == 0:
+        return None
+
+    ranked_sections = {
+        "initial": pre_reranked_results,
+        "reranked": post_reranked_results,
+    }
+
+    fit_eval: RetrievalFitStats = RetrievalFitStats(
+        fit_score_lift=0,
+        rerank_effect=0,
+        fit_scores={
+            "initial": RetrievalFitScoreMetrics(scores={}, chunk_ids=[]),
+            "reranked": RetrievalFitScoreMetrics(scores={}, chunk_ids=[]),
+        },
+    )
+
+    for rank_type, docs in ranked_sections.items():
+        logger.debug(f"rank_type: {rank_type}")
+        metrics = fit_eval.fit_scores[rank_type]
+
+        for i in [1, 5, 10]:
+            top_scores = [
+                float(doc.center_chunk.score)
+                for doc in docs[:i]
+                if isinstance(doc, InferenceSection)
+                and doc.center_chunk.score is not None
+            ]
+            # Always divide by i, even when fewer than i scores are available
+            metrics.scores[str(i)] = sum(top_scores) / i
+
+        # The overall fit score is the top-1 score
+        metrics.scores["fit_score"] = metrics.scores["1"]
+
+        metrics.chunk_ids = [
+            unique_chunk_id(doc) for doc in docs if isinstance(doc, InferenceSection)
+        ]
+
+    # Guard the division: with an initial fit score of 0 the lift keeps the
+    # value 0 it was initialized with instead of raising ZeroDivisionError.
+    initial_fit_score = fit_eval.fit_scores["initial"].scores["fit_score"]
+    if initial_fit_score != 0:
+        fit_eval.fit_score_lift = (
+            fit_eval.fit_scores["reranked"].scores["fit_score"] / initial_fit_score
+        )
+
+    fit_eval.rerank_effect = calculate_rank_shift(
+        fit_eval.fit_scores["initial"].chunk_ids,
+        fit_eval.fit_scores["reranked"].chunk_ids,
+    )
+
+    return fit_eval
--- a/backend/onyx/agents/agent_search/shared_graph_utils/models.py
+++ b/backend/onyx/agents/agent_search/shared_graph_utils/models.py
@@ -0,0 +1,114 @@
+from pydantic import BaseModel
+
+from onyx.agents.agent_search.deep_search.main.models import (
+    AgentAdditionalMetrics,
+)
+from onyx.agents.agent_search.deep_search.main.models import AgentBaseMetrics
+from onyx.agents.agent_search.deep_search.main.models import (
+    AgentRefinedMetrics,
+)
+from onyx.agents.agent_search.deep_search.main.models import AgentTimings
+from onyx.context.search.models import InferenceSection
+from onyx.tools.models import SearchQueryInfo
+
+
+class RetrievalFitScoreMetrics(BaseModel):  # per-ranking data built in get_fit_scores
+    scores: dict[str, float]  # get_fit_scores fills "1", "5", "10" and "fit_score"
+    chunk_ids: list[str]  # get_fit_scores stores unique_chunk_id values here
+
+
+class RetrievalFitStats(BaseModel):  # result type of get_fit_scores
+    fit_score_lift: float  # reranked fit score divided by the initial fit score
+    rerank_effect: float  # calculate_rank_shift(initial chunk ids, reranked chunk ids)
+    fit_scores: dict[str, RetrievalFitScoreMetrics]  # keys: "initial", "reranked"
+
+
+# class AgentChunkScores(BaseModel):
+#     scores: dict[str, dict[str, list[int | float]]]
+
+
+class AgentChunkRetrievalStats(BaseModel):  # every field defaults to None or []
+    verified_count: int | None = None
+    verified_avg_scores: float | None = None
+    rejected_count: int | None = None
+    rejected_avg_scores: float | None = None
+    verified_doc_chunk_ids: list[str] = []
+    dismissed_doc_chunk_ids: list[str] = []
+
+
+class InitialAgentResultStats(BaseModel):  # three name -> number (or None) mappings
+    sub_questions: dict[str, float | int | None]
+    original_question: dict[str, float | int | None]
+    agent_effectiveness: dict[str, float | int | None]
+
+
+class RefinedAgentStats(BaseModel):  # NOTE(review): presumably refinement ratios
+    revision_doc_efficiency: float | None
+    revision_question_efficiency: float | None
+
+
+class Term(BaseModel):  # all fields default to empty values
+    term_name: str = ""
+    term_type: str = ""
+    term_similar_to: list[str] = []  # joined with ", " when formatted as text
+
+
+### Models ###
+
+
+class Entity(BaseModel):  # all fields default to empty strings
+    entity_name: str = ""
+    entity_type: str = ""
+
+
+class Relationship(BaseModel):  # all fields default to empty values
+    relationship_name: str = ""
+    relationship_type: str = ""
+    relationship_entities: list[str] = []
+
+
+class EntityRelationshipTermExtraction(BaseModel):  # three lists, all default to []
+    entities: list[Entity] = []
+    relationships: list[Relationship] = []
+    terms: list[Term] = []
+
+
+class EntityExtractionResult(BaseModel):  # wraps one extraction result
+    retrieved_entities_relationships: EntityRelationshipTermExtraction
+
+
+class QueryRetrievalResult(BaseModel):  # retrieval outcome for a single query string
+    query: str
+    retrieved_documents: list[InferenceSection]
+    stats: RetrievalFitStats | None  # presumably the get_fit_scores result - confirm
+    query_info: SearchQueryInfo | None
+
+
+class SubQuestionAnswerResults(BaseModel):  # answer plus retrieval data of one question
+    question: str
+    question_id: str  # used as the key in dedup_question_answer_results
+    answer: str
+    verified_high_quality: bool
+    sub_query_retrieval_results: list[QueryRetrievalResult]
+    verified_reranked_documents: list[InferenceSection]
+    context_documents: list[InferenceSection]
+    cited_documents: list[InferenceSection]
+    sub_question_retrieval_stats: AgentChunkRetrievalStats
+
+
+class CombinedAgentMetrics(BaseModel):  # groups the deep_search metric models
+    timings: AgentTimings
+    base_metrics: AgentBaseMetrics | None
+    refined_metrics: AgentRefinedMetrics
+    additional_metrics: AgentAdditionalMetrics
+
+
+class PersonaPromptExpressions(BaseModel):  # see get_persona_agent_prompt_expressions
+    contextualized_prompt: str  # default or persona-formatted assistant system prompt
+    base_prompt: str | None  # persona system prompt; "" without a persona prompt
+
+
+class AgentPromptEnrichmentComponents(BaseModel):  # built by get_prompt_enrichment_components
+    persona_prompts: PersonaPromptExpressions
+    history: str  # output of build_history_prompt ("" when there is no history)
+    date_str: str  # output of build_date_time_string
--- a/backend/onyx/agents/agent_search/shared_graph_utils/operators.py
+++ b/backend/onyx/agents/agent_search/shared_graph_utils/operators.py
@@ -0,0 +1,31 @@
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    SubQuestionAnswerResults,
+)
+from onyx.chat.prune_and_merge import _merge_sections
+from onyx.context.search.models import InferenceSection
+
+
+def dedup_inference_sections(
+    list1: list[InferenceSection], list2: list[InferenceSection]
+) -> list[InferenceSection]:
+    # Concatenate both inputs and hand them to the shared section-merging helper.
+    return _merge_sections(list1 + list2)
+
+
+def dedup_question_answer_results(
+    question_answer_results_1: list[SubQuestionAnswerResults],
+    question_answer_results_2: list[SubQuestionAnswerResults],
+) -> list[SubQuestionAnswerResults]:
+    """Merge both lists, skipping list-2 results whose question_id is already known.
+
+    A new list is returned; neither input list is modified. Duplicates that are
+    already inside question_answer_results_1 are kept as they are.
+    """
+    # Copy first so that appending below cannot mutate the caller's list.
+    deduped_question_answer_results = list(question_answer_results_1)
+    utilized_question_ids = {x.question_id for x in question_answer_results_1}
+    for question_answer_result in question_answer_results_2:
+        if question_answer_result.question_id not in utilized_question_ids:
+            deduped_question_answer_results.append(question_answer_result)
+            utilized_question_ids.add(question_answer_result.question_id)
+    return deduped_question_answer_results
--- a/backend/onyx/agents/agent_search/shared_graph_utils/utils.py
+++ b/backend/onyx/agents/agent_search/shared_graph_utils/utils.py
@@ -0,0 +1,433 @@
+import os
+import re
+from collections.abc import Callable
+from collections.abc import Iterator
+from collections.abc import Sequence
+from datetime import datetime
+from typing import Any
+from typing import cast
+from typing import Literal
+from typing import TypedDict
+from uuid import UUID
+
+from langchain_core.messages import BaseMessage
+from langchain_core.messages import HumanMessage
+from langgraph.types import StreamWriter
+from sqlalchemy.orm import Session
+
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.models import GraphInputs
+from onyx.agents.agent_search.models import GraphPersistence
+from onyx.agents.agent_search.models import GraphSearchConfig
+from onyx.agents.agent_search.models import GraphTooling
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    EntityRelationshipTermExtraction,
+)
+from onyx.agents.agent_search.shared_graph_utils.models import PersonaPromptExpressions
+from onyx.chat.models import AnswerPacket
+from onyx.chat.models import AnswerStyleConfig
+from onyx.chat.models import CitationConfig
+from onyx.chat.models import DocumentPruningConfig
+from onyx.chat.models import PromptConfig
+from onyx.chat.models import SectionRelevancePiece
+from onyx.chat.models import StreamStopInfo
+from onyx.chat.models import StreamStopReason
+from onyx.chat.models import StreamType
+from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
+from onyx.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
+from onyx.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
+from onyx.configs.constants import DEFAULT_PERSONA_ID
+from onyx.configs.constants import DISPATCH_SEP_CHAR
+from onyx.configs.constants import FORMAT_DOCS_SEPARATOR
+from onyx.context.search.enums import LLMEvaluationType
+from onyx.context.search.models import InferenceSection
+from onyx.context.search.models import RetrievalDetails
+from onyx.context.search.models import SearchRequest
+from onyx.db.engine import get_session_context_manager
+from onyx.db.persona import get_persona_by_id
+from onyx.db.persona import Persona
+from onyx.llm.interfaces import LLM
+from onyx.prompts.agent_search import (
+    ASSISTANT_SYSTEM_PROMPT_DEFAULT,
+)
+from onyx.prompts.agent_search import (
+    ASSISTANT_SYSTEM_PROMPT_PERSONA,
+)
+from onyx.prompts.agent_search import (
+    HISTORY_CONTEXT_SUMMARY_PROMPT,
+)
+from onyx.prompts.prompt_utils import handle_onyx_date_awareness
+from onyx.tools.force import ForceUseTool
+from onyx.tools.tool_constructor import SearchToolConfig
+from onyx.tools.tool_implementations.search.search_tool import (
+    SEARCH_RESPONSE_SUMMARY_ID,
+)
+from onyx.tools.tool_implementations.search.search_tool import SearchResponseSummary
+from onyx.tools.tool_implementations.search.search_tool import SearchTool
+from onyx.tools.utils import explicit_tool_calling_supported
+
+BaseMessage_Content = str | list[str | dict[str, Any]]
+
+
+# Post-processing
+def format_docs(docs: Sequence[InferenceSection]) -> str:
+    """
+    Render each section as a "**Document: D<n>**" block (numbering starts at 1)
+    and join the blocks with FORMAT_DOCS_SEPARATOR. The title and metadata
+    lines are only added when the center chunk provides them.
+    """
+    rendered_docs: list[str] = []
+
+    for position, section in enumerate(docs, start=1):
+        chunk = section.center_chunk
+        pieces = [f"**Document: D{position}**"]
+        if chunk.title:
+            pieces.append(f"\nTitle: {chunk.title}")
+        if chunk.metadata:
+            # Only string and list values are rendered; other value types are skipped
+            metadata_parts = []
+            for key, value in chunk.metadata.items():
+                if isinstance(value, str):
+                    metadata_parts.append(f" - {key}: {value}")
+                elif isinstance(value, list):
+                    metadata_parts.append(f" - {key}: {', '.join(value)}")
+            pieces.append(f"\nMetadata: {''.join(metadata_parts)}")
+        pieces.append(f"\nContent:\n{section.combined_content}")
+        rendered_docs.append("".join(pieces))
+    return FORMAT_DOCS_SEPARATOR.join(rendered_docs)
+
+
+def format_entity_term_extraction(
+    entity_term_extraction_dict: EntityRelationshipTermExtraction,
+) -> str:
+    """
+    Render extracted entities, relationships and terms as one text block.
+
+    The text consists of an "Entities:", a "Relationships:" and a "Terms:"
+    header, each followed by one line per item. All pieces are joined with
+    single newlines; no bullet prefixes are added.
+    """
+    entities = entity_term_extraction_dict.entities
+    terms = entity_term_extraction_dict.terms
+    relationships = entity_term_extraction_dict.relationships
+
+    entity_strs = ["\nEntities:\n"]
+    for entity in entities:
+        entity_strs.append(f"{entity.entity_name} ({entity.entity_type})")
+
+    relationship_strs = ["\n\nRelationships:\n"]
+    for relationship in relationships:
+        relationship_name = relationship.relationship_name
+        relationship_type = relationship.relationship_type
+        relationship_entities = relationship.relationship_entities
+        relationship_strs.append(
+            f"""{relationship_name} ({relationship_type}): {relationship_entities}"""
+        )
+
+    term_strs = ["\n\nTerms:\n"]
+    for term in terms:
+        similar_terms = ", ".join(term.term_similar_to)
+        term_strs.append(
+            f"{term.term_name} ({term.term_type}): similar to {similar_terms}"
+        )
+    return "\n".join(entity_strs + relationship_strs + term_strs)
+
+
+def get_test_config(
+    db_session: Session,
+    primary_llm: LLM,
+    fast_llm: LLM,
+    search_request: SearchRequest,
+    use_agentic_search: bool = True,
+) -> GraphConfig:
+    persona = get_persona_by_id(DEFAULT_PERSONA_ID, None, db_session)
+    document_pruning_config = DocumentPruningConfig(
+        max_chunks=int(
+            persona.num_chunks
+            if persona.num_chunks is not None
+            else MAX_CHUNKS_FED_TO_CHAT
+        ),
+        max_window_percentage=CHAT_TARGET_CHUNK_PERCENTAGE,
+    )
+
+    answer_style_config = AnswerStyleConfig(
+        citation_config=CitationConfig(
+            # The docs retrieved by this flow are already relevance-filtered
+            all_docs_useful=True
+        ),
+        document_pruning_config=document_pruning_config,
+        structured_response_format=None,
+    )
+
+    search_tool_config = SearchToolConfig(
+        answer_style_config=answer_style_config,
+        document_pruning_config=document_pruning_config,
+        retrieval_options=RetrievalDetails(),  # may want to set dedupe_docs=True
+        rerank_settings=None,  # Can use this to change reranking model
+        selected_sections=None,
+        latest_query_files=None,
+        bypass_acl=False,
+    )
+
+    prompt_config = PromptConfig.from_model(persona.prompts[0])  # first persona prompt
+
+    search_tool = SearchTool(
+        db_session=db_session,
+        user=None,
+        persona=persona,
+        retrieval_options=search_tool_config.retrieval_options,
+        prompt_config=prompt_config,
+        llm=primary_llm,
+        fast_llm=fast_llm,
+        pruning_config=search_tool_config.document_pruning_config,
+        answer_style_config=search_tool_config.answer_style_config,
+        selected_sections=search_tool_config.selected_sections,
+        chunks_above=search_tool_config.chunks_above,
+        chunks_below=search_tool_config.chunks_below,
+        full_doc=search_tool_config.full_doc,
+        evaluation_type=(
+            LLMEvaluationType.BASIC
+            if persona.llm_relevance_filter
+            else LLMEvaluationType.SKIP
+        ),
+        rerank_settings=search_tool_config.rerank_settings,
+        bypass_acl=search_tool_config.bypass_acl,
+    )
+
+    graph_inputs = GraphInputs(
+        search_request=search_request,
+        prompt_builder=AnswerPromptBuilder(
+            user_message=HumanMessage(content=search_request.query),
+            message_history=[],  # the test config starts without chat history
+            llm_config=primary_llm.config,
+            raw_user_query=search_request.query,
+            raw_user_uploaded_files=[],
+        ),
+        structured_response_format=answer_style_config.structured_response_format,
+    )
+
+    using_tool_calling_llm = explicit_tool_calling_supported(
+        primary_llm.config.model_provider, primary_llm.config.model_name
+    )
+    graph_tooling = GraphTooling(
+        primary_llm=primary_llm,
+        fast_llm=fast_llm,
+        search_tool=search_tool,
+        tools=[search_tool],  # the search tool is the only tool in the test config
+        force_use_tool=ForceUseTool(force_use=False, tool_name=""),
+        using_tool_calling_llm=using_tool_calling_llm,
+    )
+
+    # The chat session id is read from the environment and must be a valid UUID
+    chat_session_id = os.environ.get("ONYX_AS_CHAT_SESSION_ID")
+    assert (
+        chat_session_id is not None
+    ), "ONYX_AS_CHAT_SESSION_ID must be set for backend tests"
+    graph_persistence = GraphPersistence(
+        db_session=db_session,
+        chat_session_id=UUID(chat_session_id),
+        message_id=1,  # hard-coded message id
+    )
+
+    search_behavior_config = GraphSearchConfig(
+        use_agentic_search=use_agentic_search,
+        skip_gen_ai_answer_generation=False,
+        allow_refinement=True,
+    )
+    graph_config = GraphConfig(
+        inputs=graph_inputs,
+        tooling=graph_tooling,
+        persistence=graph_persistence,
+        behavior=search_behavior_config,
+    )
+
+    return graph_config
+
+
+def get_persona_agent_prompt_expressions(
+    persona: Persona | None,
+) -> PersonaPromptExpressions:
+    """
+    Build the prompt expressions the agent flow uses for the given persona.
+
+    For a persona with at least one prompt, the first prompt's system prompt
+    is passed through `handle_onyx_date_awareness`; the result is returned
+    both wrapped in ASSISTANT_SYSTEM_PROMPT_PERSONA (contextualized_prompt)
+    and on its own (base_prompt).
+    Without a persona, or for a persona without prompts, the default assistant
+    system prompt is returned together with an empty base prompt.
+    """
+    if persona is not None and len(persona.prompts) > 0:
+        # Only a 1:1 mapping between personas and prompts currently
+        first_prompt = persona.prompts[0]
+        config = PromptConfig.from_model(first_prompt)
+        system_prompt = handle_onyx_date_awareness(
+            prompt_str=config.system_prompt,
+            prompt_config=config,
+            add_additional_info_if_no_tag=first_prompt.datetime_aware,
+        )
+        contextualized = ASSISTANT_SYSTEM_PROMPT_PERSONA.format(
+            persona_prompt=system_prompt
+        )
+        return PersonaPromptExpressions(
+            contextualized_prompt=contextualized,
+            base_prompt=system_prompt,
+        )
+
+    # TODO base_prompt should be None, but no time to properly fix
+    return PersonaPromptExpressions(
+        contextualized_prompt=ASSISTANT_SYSTEM_PROMPT_DEFAULT, base_prompt=""
+    )
+
+
+def make_question_id(level: int, question_num: int) -> str:
+    """Encode a (level, question_num) pair as the string '<level>_<question_num>'."""
+    return "{}_{}".format(level, question_num)
+
+
+def parse_question_id(question_id: str) -> tuple[int, int]:
+    """
+    Split a '<level>_<question_num>' id back into its two integers.
+
+    Raises ValueError when the id does not consist of exactly two
+    underscore-separated parts or when a part is not an integer.
+    """
+    parts = question_id.split("_")
+    level_str, question_num_str = parts
+    return int(level_str), int(question_num_str)
+
+
+def _dispatch_nonempty(
+    content: str, dispatch_event: Callable[[str, int], None], sep_num: int
+) -> None:
+    """
+    Forward `content` to `dispatch_event` unless it is the empty string.
+
+    Used while streaming an arbitrary number of similar objects that the LLM
+    separates with a separator. `sep_num` tells the callback which object the
+    content belongs to; the numbers start at 1 and go up to however many
+    objects the LLM decides to stream.
+    """
+    if content == "":
+        return
+    dispatch_event(content, sep_num)
+
+
+def dispatch_separated(
+    tokens: Iterator[BaseMessage],
+    dispatch_event: Callable[[str, int], None],
+    sep: str = DISPATCH_SEP_CHAR,
+) -> list[BaseMessage_Content]:
+    """
+    Stream LLM tokens to `dispatch_event`, numbering the separated objects.
+
+    Each token's content is dispatched together with the 1-based number of
+    the object it belongs to. Every occurrence of `sep` in the stream starts
+    the next object; the separator itself is never dispatched, and empty
+    pieces are skipped.
+
+    Args:
+        tokens: the streamed LLM messages
+        dispatch_event: callback receiving (content, object number)
+        sep: the separator between objects
+
+    Returns:
+        The raw content of every token, in stream order (separators included).
+    """
+    num = 1
+    streamed_tokens: list[BaseMessage_Content] = []
+    for token in tokens:
+        content = cast(str, token.content)
+        if sep in content:
+            first_part, *later_parts = content.split(sep)
+            _dispatch_nonempty(first_part, dispatch_event, num)
+            # A single token may contain more than one separator. Advance the
+            # object number once per separator so every piece is attributed
+            # to the right object instead of being merged into the next one.
+            for part in later_parts:
+                num += 1
+                _dispatch_nonempty(part.strip(), dispatch_event, num)
+        else:
+            _dispatch_nonempty(content, dispatch_event, num)
+        streamed_tokens.append(content)
+
+    return streamed_tokens
+
+
+def dispatch_main_answer_stop_info(level: int, writer: StreamWriter) -> None:
+    """Write a "stream_finished" event marking the main answer of `level` as finished."""
+    write_custom_event(
+        "stream_finished",
+        StreamStopInfo(
+            stop_reason=StreamStopReason.FINISHED,
+            stream_type=StreamType.MAIN_ANSWER,
+            level=level,
+        ),
+        writer,
+    )
+
+
+def retrieve_search_docs(
+    search_tool: SearchTool, question: str
+) -> list[InferenceSection]:
+    """
+    Run the search tool for `question` (re-ranking disabled) and return the
+    top sections of the first search response summary it yields.
+
+    Returns an empty list when the tool yields no search response summary.
+    """
+    # new db session to avoid concurrency issues
+    with get_session_context_manager() as db_session:
+        tool_responses = search_tool.run(
+            query=question,
+            force_no_rerank=True,
+            alternate_db_session=db_session,
+        )
+        for tool_response in tool_responses:
+            if tool_response.id != SEARCH_RESPONSE_SUMMARY_ID:
+                continue
+            # get retrieved docs to send to the rest of the graph
+            summary = cast(SearchResponseSummary, tool_response.response)
+            return summary.top_sections
+
+    return []
+
+
+def get_answer_citation_ids(answer_str: str) -> list[int]:
+    """
+    Extract citation numbers of format [D<number>] from the answer string.
+
+    Returns the distinct zero-based ids (citation number minus one) in
+    ascending order, so the result does not depend on set iteration order.
+    """
+    citation_numbers = re.findall(r"\[D(\d+)\]", answer_str)
+    return sorted({int(number) - 1 for number in citation_numbers})
+
+
+def summarize_history(
+    history: str, question: str, persona_specification: str | None, llm: LLM
+) -> str:
+    """
+    Ask `llm` for a summary of the chat history in the context of `question`.
+
+    HISTORY_CONTEXT_SUMMARY_PROMPT is filled with the persona specification,
+    the question and the history; citation markers are removed from the
+    filled prompt before it is sent to the LLM. The response content must be
+    a string and is returned as is.
+    """
+    filled_prompt = HISTORY_CONTEXT_SUMMARY_PROMPT.format(
+        persona_specification=persona_specification,
+        question=question,
+        history=history,
+    )
+    llm_response = llm.invoke(remove_document_citations(filled_prompt))
+
+    summary = llm_response.content
+    assert isinstance(summary, str)
+    return summary
+
+
+# Taken from langchain_core.runnables.schema.
+# We don't use the one from their library because
+# it includes ids they generate.
+class CustomStreamEvent(TypedDict):
+    """Custom event written to the stream: an event marker, a name and a free-form payload."""
+
+    # Overwrite the event field to be more specific.
+    event: Literal["on_custom_event"]  # type: ignore[misc]
+    """The event type."""
+    name: str
+    """User defined name for the event."""
+    data: Any
+    """The data associated with the event. Free form and can be anything."""
+
+
+def write_custom_event(
+    name: str, event: AnswerPacket, stream_writer: StreamWriter
+) -> None:
+    """Wrap `event` in a CustomStreamEvent named `name` and pass it to `stream_writer`."""
+    custom_event = CustomStreamEvent(event="on_custom_event", name=name, data=event)
+    stream_writer(custom_event)
+
+
+def relevance_from_docs(
+    relevant_docs: list[InferenceSection],
+) -> list[SectionRelevancePiece]:
+    """
+    Build one SectionRelevancePiece per section, each marked as relevant and
+    described by the content, document id and chunk id of its center chunk.
+    """
+    relevance_pieces: list[SectionRelevancePiece] = []
+    for section in relevant_docs:
+        chunk = section.center_chunk
+        relevance_pieces.append(
+            SectionRelevancePiece(
+                relevant=True,
+                content=chunk.content,
+                document_id=chunk.document_id,
+                chunk_id=chunk.chunk_id,
+            )
+        )
+    return relevance_pieces
+
+
+def get_langgraph_node_log_string(
+    graph_component: str,
+    node_name: str,
+    node_start_time: datetime,
+    result: str | None = None,
+) -> str:
+    """
+    Format a log line for a graph node: start time, component, node name and
+    the time elapsed since `node_start_time`, followed by the result if given.
+    """
+    elapsed = datetime.now() - node_start_time
+    log_line = (
+        f"{node_start_time} -- {graph_component} - {node_name}"
+        f" -- Time taken: {elapsed}"
+    )
+    if result is not None:
+        log_line += f" -- Result: {result}"
+    return log_line
+
+
+def remove_document_citations(text: str) -> str:
+    """
+    Removes bracketed citation markers such as '[D1]', '[Q2]' or '[3]' from text.
+    The D/Q prefix is optional and the number can vary.
+
+    Only the bracketed marker itself is matched: for a citation written as
+    '[[D1]]()' the inner '[D1]' is removed and '[]()' is left behind.
+
+    Args:
+        text: Input text containing citations
+
+    Returns:
+        Text with citation markers removed
+    """
+    # Pattern explanation:
+    # \[(?:D|Q)?\d+\]  matches:
+    #   \[   - literal [ character
+    #   (?:D|Q)?  - optional D or Q character
+    #   \d+  - one or more digits
+    #   \]   - literal ] character
+    return re.sub(r"\[(?:D|Q)?\d+\]", "", text)
--- a/backend/onyx/chat/answer.py
+++ b/backend/onyx/chat/answer.py
@@ -1,281 +1,140 @@
+from collections import defaultdict
 from collections.abc import Callable
-from collections.abc import Iterator
-from uuid import uuid4
+from uuid import UUID

-from langchain.schema.messages import BaseMessage
-from langchain_core.messages import AIMessageChunk
-from langchain_core.messages import ToolCall
+from sqlalchemy.orm import Session

-from onyx.chat.llm_response_handler import LLMResponseHandlerManager
-from onyx.chat.models import AnswerQuestionPossibleReturn
+from onyx.agents.agent_search.models import GraphConfig
+from onyx.agents.agent_search.models import GraphInputs
+from onyx.agents.agent_search.models import GraphPersistence
+from onyx.agents.agent_search.models import GraphSearchConfig
+from onyx.agents.agent_search.models import GraphTooling
+from onyx.agents.agent_search.run_graph import run_basic_graph
+from onyx.agents.agent_search.run_graph import run_main_graph
+from onyx.chat.models import AgentAnswerPiece
+from onyx.chat.models import AnswerPacket
+from onyx.chat.models import AnswerStream
 from onyx.chat.models import AnswerStyleConfig
 from onyx.chat.models import CitationInfo
 from onyx.chat.models import OnyxAnswerPiece
-from onyx.chat.models import PromptConfig
+from onyx.chat.models import StreamStopInfo
+from onyx.chat.models import StreamStopReason
+from onyx.chat.models import SubQuestionKey
 from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
-from onyx.chat.prompt_builder.answer_prompt_builder import default_build_system_message
-from onyx.chat.prompt_builder.answer_prompt_builder import default_build_user_message
-from onyx.chat.prompt_builder.answer_prompt_builder import LLMCall
-from onyx.chat.stream_processing.answer_response_handler import (
-    CitationResponseHandler,
-)
-from onyx.chat.stream_processing.answer_response_handler import (
-    DummyAnswerResponseHandler,
-)
-from onyx.chat.stream_processing.utils import (
-    map_document_id_order,
-)
-from onyx.chat.tool_handling.tool_response_handler import ToolResponseHandler
+from onyx.configs.constants import BASIC_KEY
+from onyx.context.search.models import SearchRequest
 from onyx.file_store.utils import InMemoryChatFile
 from onyx.llm.interfaces import LLM
-from onyx.llm.models import PreviousMessage
-from onyx.natural_language_processing.utils import get_tokenizer
 from onyx.tools.force import ForceUseTool
-from onyx.tools.models import ToolResponse
 from onyx.tools.tool import Tool
 from onyx.tools.tool_implementations.search.search_tool import SearchTool
-from onyx.tools.tool_runner import ToolCallKickoff
 from onyx.tools.utils import explicit_tool_calling_supported
 from onyx.utils.logger import setup_logger

-
 logger = setup_logger()

-
-AnswerStream = Iterator[AnswerQuestionPossibleReturn | ToolCallKickoff | ToolResponse]
+BASIC_SQ_KEY = SubQuestionKey(level=BASIC_KEY[0], question_num=BASIC_KEY[1])


 class Answer:
    def __init__(
        self,
-        question: str,
+        prompt_builder: AnswerPromptBuilder,
        answer_style_config: AnswerStyleConfig,
        llm: LLM,
-        prompt_config: PromptConfig,
+        fast_llm: LLM,
        force_use_tool: ForceUseTool,
-        # must be the same length as `docs`. If None, all docs are considered "relevant"
-        message_history: list[PreviousMessage] | None = None,
-        single_message_history: str | None = None,
+        search_request: SearchRequest,
+        chat_session_id: UUID,
+        current_agent_message_id: int,
+        db_session: Session,
        # newly passed in files to include as part of this question
        # TODO THIS NEEDS TO BE HANDLED
        latest_query_files: list[InMemoryChatFile] | None = None,
-        files: list[InMemoryChatFile] | None = None,
        tools: list[Tool] | None = None,
        # NOTE: for native tool-calling, this is only supported by OpenAI atm,
        #       but we only support them anyways
        # if set to True, then never use the LLMs provided tool-calling functonality
        skip_explicit_tool_calling: bool = False,
-        # Returns the full document sections text from the search tool
-        return_contexts: bool = False,
        skip_gen_ai_answer_generation: bool = False,
        is_connected: Callable[[], bool] | None = None,
+        use_agentic_search: bool = False,
    ) -> None:
-        if single_message_history and message_history:
-            raise ValueError(
-                "Cannot provide both `message_history` and `single_message_history`"
-            )
-
-        self.question = question
        self.is_connected: Callable[[], bool] | None = is_connected
-
-        self.latest_query_files = latest_query_files or []
-        self.file_id_to_file = {file.file_id: file for file in (files or [])}
-
-        self.tools = tools or []
-        self.force_use_tool = force_use_tool
-
-        self.message_history = message_history or []
-        # used for QA flow where we only want to send a single message
-        self.single_message_history = single_message_history
-
-        self.answer_style_config = answer_style_config
-        self.prompt_config = prompt_config
-
-        self.llm = llm
-        self.llm_tokenizer = get_tokenizer(
-            provider_type=llm.config.model_provider,
-            model_name=llm.config.model_name,
-        )
-
-        self._final_prompt: list[BaseMessage] | None = None
-
-        self._streamed_output: list[str] | None = None
-        self._processed_stream: (
-            list[AnswerQuestionPossibleReturn | ToolResponse | ToolCallKickoff] | None
-        ) = None
-
-        self._return_contexts = return_contexts
-        self.skip_gen_ai_answer_generation = skip_gen_ai_answer_generation
+        self._processed_stream: (list[AnswerPacket] | None) = None
        self._is_cancelled = False

-        self.using_tool_calling_llm = (
+        search_tools = [tool for tool in (tools or []) if isinstance(tool, SearchTool)]
+        search_tool: SearchTool | None = None
+
+        if len(search_tools) > 1:
+            # TODO: handle multiple search tools
+            raise ValueError("Multiple search tools found")
+        elif len(search_tools) == 1:
+            search_tool = search_tools[0]
+
+        using_tool_calling_llm = (
            explicit_tool_calling_supported(
-                self.llm.config.model_provider, self.llm.config.model_name
+                llm.config.model_provider, llm.config.model_name
            )
            and not skip_explicit_tool_calling
        )

-    def _get_tools_list(self) -> list[Tool]:
-        if not self.force_use_tool.force_use:
-            return self.tools
-
-        tool = next(
-            (t for t in self.tools if t.name == self.force_use_tool.tool_name), None
+        self.graph_inputs = GraphInputs(
+            search_request=search_request,
+            prompt_builder=prompt_builder,
+            files=latest_query_files,
+            structured_response_format=answer_style_config.structured_response_format,
        )
-        if tool is None:
-            raise RuntimeError(f"Tool '{self.force_use_tool.tool_name}' not found")
-
-        logger.info(
-            f"Forcefully using tool='{tool.name}'"
-            + (
-                f" with args='{self.force_use_tool.args}'"
-                if self.force_use_tool.args is not None
-                else ""
-            )
+        self.graph_tooling = GraphTooling(
+            primary_llm=llm,
+            fast_llm=fast_llm,
+            search_tool=search_tool,
+            tools=tools or [],
+            force_use_tool=force_use_tool,
+            using_tool_calling_llm=using_tool_calling_llm,
        )
-        return [tool]
-
-    def _handle_specified_tool_call(
-        self, llm_calls: list[LLMCall], tool: Tool, tool_args: dict
-    ) -> AnswerStream:
-        current_llm_call = llm_calls[-1]
-
-        # make a dummy tool handler
-        tool_handler = ToolResponseHandler([tool])
-
-        dummy_tool_call_chunk = AIMessageChunk(content="")
-        dummy_tool_call_chunk.tool_calls = [
-            ToolCall(name=tool.name, args=tool_args, id=str(uuid4()))
-        ]
-
-        response_handler_manager = LLMResponseHandlerManager(
-            tool_handler, DummyAnswerResponseHandler(), self.is_cancelled
+        assert db_session, "db_session must be provided for agentic persistence"
+        self.graph_persistence = GraphPersistence(
+            db_session=db_session,
+            chat_session_id=chat_session_id,
+            message_id=current_agent_message_id,
        )
-        yield from response_handler_manager.handle_llm_response(
-            iter([dummy_tool_call_chunk])
+        self.search_behavior_config = GraphSearchConfig(
+            use_agentic_search=use_agentic_search,
+            skip_gen_ai_answer_generation=skip_gen_ai_answer_generation,
+            allow_refinement=True,
        )
-
-        new_llm_call = response_handler_manager.next_llm_call(current_llm_call)
-        if new_llm_call:
-            yield from self._get_response(llm_calls + [new_llm_call])
-        else:
-            raise RuntimeError("Tool call handler did not return a new LLM call")
-
-    def _get_response(self, llm_calls: list[LLMCall]) -> AnswerStream:
-        current_llm_call = llm_calls[-1]
-
-        # handle the case where no decision has to be made; we simply run the tool
-        if (
-            current_llm_call.force_use_tool.force_use
-            and current_llm_call.force_use_tool.args is not None
-        ):
-            tool_name, tool_args = (
-                current_llm_call.force_use_tool.tool_name,
-                current_llm_call.force_use_tool.args,
-            )
-            tool = next(
-                (t for t in current_llm_call.tools if t.name == tool_name), None
-            )
-            if not tool:
-                raise RuntimeError(f"Tool '{tool_name}' not found")
-
-            yield from self._handle_specified_tool_call(llm_calls, tool, tool_args)
-            return
-
-        # special pre-logic for non-tool calling LLM case
-        if not self.using_tool_calling_llm and current_llm_call.tools:
-            chosen_tool_and_args = (
-                ToolResponseHandler.get_tool_call_for_non_tool_calling_llm(
-                    current_llm_call, self.llm
-                )
-            )
-            if chosen_tool_and_args:
-                tool, tool_args = chosen_tool_and_args
-                yield from self._handle_specified_tool_call(llm_calls, tool, tool_args)
-                return
-
-        # if we're skipping gen ai answer generation, we should break
-        # out unless we're forcing a tool call. If we don't, we might generate an
-        # answer, which is a no-no!
-        if (
-            self.skip_gen_ai_answer_generation
-            and not current_llm_call.force_use_tool.force_use
-        ):
-            return
-
-        # set up "handlers" to listen to the LLM response stream and
-        # feed back the processed results + handle tool call requests
-        # + figure out what the next LLM call should be
-        tool_call_handler = ToolResponseHandler(current_llm_call.tools)
-
-        final_search_results, displayed_search_results = SearchTool.get_search_result(
-            current_llm_call
-        ) or ([], [])
-
-        answer_handler = CitationResponseHandler(
-            context_docs=final_search_results,
-            final_doc_id_to_rank_map=map_document_id_order(final_search_results),
-            display_doc_id_to_rank_map=map_document_id_order(displayed_search_results),
+        self.graph_config = GraphConfig(
+            inputs=self.graph_inputs,
+            tooling=self.graph_tooling,
+            persistence=self.graph_persistence,
+            behavior=self.search_behavior_config,
        )

-        response_handler_manager = LLMResponseHandlerManager(
-            tool_call_handler, answer_handler, self.is_cancelled
-        )
-
-        # DEBUG: good breakpoint
-        stream = self.llm.stream(
-            # For tool calling LLMs, we want to insert the task prompt as part of this flow, this is because the LLM
-            # may choose to not call any tools and just generate the answer, in which case the task prompt is needed.
-            prompt=current_llm_call.prompt_builder.build(),
-            tools=[tool.tool_definition() for tool in current_llm_call.tools] or None,
-            tool_choice=(
-                "required"
-                if current_llm_call.tools and current_llm_call.force_use_tool.force_use
-                else None
-            ),
-            structured_response_format=self.answer_style_config.structured_response_format,
-        )
-        yield from response_handler_manager.handle_llm_response(stream)
-
-        new_llm_call = response_handler_manager.next_llm_call(current_llm_call)
-        if new_llm_call:
-            yield from self._get_response(llm_calls + [new_llm_call])
-
    @property
    def processed_streamed_output(self) -> AnswerStream:
        if self._processed_stream is not None:
            yield from self._processed_stream
            return

-        prompt_builder = AnswerPromptBuilder(
-            user_message=default_build_user_message(
-                user_query=self.question,
-                prompt_config=self.prompt_config,
-                files=self.latest_query_files,
-                single_message_history=self.single_message_history,
-            ),
-            message_history=self.message_history,
-            llm_config=self.llm.config,
-            raw_user_query=self.question,
-            raw_user_uploaded_files=self.latest_query_files or [],
-            single_message_history=self.single_message_history,
+        run_langgraph = (
+            run_main_graph
+            if self.graph_config.behavior.use_agentic_search
+            else run_basic_graph
        )
-        prompt_builder.update_system_prompt(
-            default_build_system_message(self.prompt_config)
-        )
-        llm_call = LLMCall(
-            prompt_builder=prompt_builder,
-            tools=self._get_tools_list(),
-            force_use_tool=self.force_use_tool,
-            files=self.latest_query_files,
-            tool_call_info=[],
-            using_tool_calling_llm=self.using_tool_calling_llm,
+        stream = run_langgraph(
+            self.graph_config,
        )

        processed_stream = []
-        for processed_packet in self._get_response([llm_call]):
-            processed_stream.append(processed_packet)
-            yield processed_packet
+        for packet in stream:
+            if self.is_cancelled():
+                packet = StreamStopInfo(stop_reason=StreamStopReason.CANCELLED)
+                yield packet
+                break
+            processed_stream.append(packet)
+            yield packet

        self._processed_stream = processed_stream

@@ -283,20 +142,59 @@ class Answer:
    def llm_answer(self) -> str:
        answer = ""
        for packet in self.processed_streamed_output:
-            if isinstance(packet, OnyxAnswerPiece) and packet.answer_piece:
+            # handle basic answer flow, plus level 0 agent answer flow
+            # since level 0 is the first answer the user sees and therefore the
+            # child message of the user message in the db (so it is handled
+            # like a basic flow answer)
+            if (isinstance(packet, OnyxAnswerPiece) and packet.answer_piece) or (
+                isinstance(packet, AgentAnswerPiece)
+                and packet.answer_piece
+                and packet.answer_type == "agent_level_answer"
+                and packet.level == 0
+            ):
                answer += packet.answer_piece

        return answer

+    def llm_answer_by_level(self) -> dict[int, str]:
+        """
+        Concatenate the streamed answer pieces per level.
+
+        Agent pieces of type "agent_level_answer" are collected under their
+        own level; plain OnyxAnswerPiece packets are collected under
+        BASIC_KEY[0]. Packets with an empty answer piece are ignored.
+        """
+        answers: dict[int, str] = defaultdict(str)
+        for packet in self.processed_streamed_output:
+            is_level_answer = (
+                isinstance(packet, AgentAnswerPiece)
+                and packet.answer_piece
+                and packet.answer_type == "agent_level_answer"
+            )
+            if is_level_answer:
+                assert packet.level is not None
+                answers[packet.level] += packet.answer_piece
+                continue
+            if isinstance(packet, OnyxAnswerPiece) and packet.answer_piece:
+                answers[BASIC_KEY[0]] += packet.answer_piece
+        return answers
+
    @property
    def citations(self) -> list[CitationInfo]:
        citations: list[CitationInfo] = []
        for packet in self.processed_streamed_output:
-            if isinstance(packet, CitationInfo):
+            if isinstance(packet, CitationInfo) and packet.level is None:
                citations.append(packet)

        return citations

+    def citations_by_subquestion(self) -> dict[SubQuestionKey, list[CitationInfo]]:
+        """
+        Group the streamed citations by the sub-question they belong to.
+
+        Citations without a level are stored under BASIC_SQ_KEY. Citations
+        with both a level and a question number are stored under the matching
+        SubQuestionKey. Citations with a level but no question number are
+        dropped.
+        """
+        grouped: dict[SubQuestionKey, list[CitationInfo]] = defaultdict(list)
+        for packet in self.processed_streamed_output:
+            if not isinstance(packet, CitationInfo):
+                continue
+            if packet.level is None:
+                grouped[BASIC_SQ_KEY].append(packet)
+            elif packet.level_question_num is not None:
+                key = SubQuestionKey(
+                    level=packet.level, question_num=packet.level_question_num
+                )
+                grouped[key].append(packet)
+        return grouped
+
    def is_cancelled(self) -> bool:
        if self._is_cancelled:
            return True
--- a/backend/onyx/chat/chat_utils.py
+++ b/backend/onyx/chat/chat_utils.py
@@ -48,6 +48,8 @@ def prepare_chat_message_request(
    retrieval_details: RetrievalDetails | None,
    rerank_settings: RerankingDetails | None,
    db_session: Session,
+    use_agentic_search: bool = False,
+    skip_gen_ai_answer_generation: bool = False,
 ) -> CreateChatMessageRequest:
    # Typically used for one shot flows like SlackBot or non-chat API endpoint use cases
    new_chat_session = create_chat_session(
@@ -72,6 +74,8 @@ def prepare_chat_message_request(
        search_doc_ids=None,
        retrieval_options=retrieval_details,
        rerank_settings=rerank_settings,
+        use_agentic_search=use_agentic_search,
+        skip_gen_ai_answer_generation=skip_gen_ai_answer_generation,
    )


@@ -162,6 +166,7 @@ def create_chat_chain(
        )

    current_message: ChatMessage | None = root_message
+    previous_message: ChatMessage | None = None
    while current_message is not None:
        child_msg = current_message.latest_child_message

@@ -179,7 +184,17 @@ def create_chat_chain(
                "could not find next message in the same session"
            )

-        mainline_messages.append(current_message)
+        if (
+            current_message.message_type == MessageType.ASSISTANT
+            and previous_message is not None
+            and previous_message.message_type == MessageType.ASSISTANT
+            and mainline_messages
+        ):
+            mainline_messages[-1] = current_message
+        else:
+            mainline_messages.append(current_message)
+
+        previous_message = current_message

    if not mainline_messages:
        raise RuntimeError("Could not trace chat message history")
--- a/backend/onyx/chat/llm_response_handler.py
+++ b/backend/onyx/chat/llm_response_handler.py
@@ -9,25 +9,37 @@ from onyx.chat.models import StreamStopInfo
 from onyx.chat.models import StreamStopReason
 from onyx.chat.prompt_builder.answer_prompt_builder import LLMCall
 from onyx.chat.stream_processing.answer_response_handler import AnswerResponseHandler
+from onyx.chat.stream_processing.answer_response_handler import (
+    DummyAnswerResponseHandler,
+)
 from onyx.chat.tool_handling.tool_response_handler import ToolResponseHandler


 class LLMResponseHandlerManager:
+    """
+    This class is responsible for postprocessing the LLM response stream.
+    In particular, we:
+    1. handle the tool call requests
+    2. handle citations
+    3. pass through answers generated by the LLM
+    4. Stop yielding if the client disconnects
+    """
+
    def __init__(
        self,
-        tool_handler: ToolResponseHandler,
-        answer_handler: AnswerResponseHandler,
+        tool_handler: ToolResponseHandler | None,
+        answer_handler: AnswerResponseHandler | None,
        is_cancelled: Callable[[], bool],
    ):
-        self.tool_handler = tool_handler
-        self.answer_handler = answer_handler
+        self.tool_handler = tool_handler or ToolResponseHandler([])
+        self.answer_handler = answer_handler or DummyAnswerResponseHandler()
        self.is_cancelled = is_cancelled

    def handle_llm_response(
        self,
        stream: Iterator[BaseMessage],
    ) -> Generator[ResponsePart, None, None]:
-        all_messages: list[BaseMessage] = []
+        all_messages: list[BaseMessage | str] = []
        for message in stream:
            if self.is_cancelled():
                yield StreamStopInfo(stop_reason=StreamStopReason.CANCELLED)
--- a/backend/onyx/chat/models.py
+++ b/backend/onyx/chat/models.py
@@ -3,6 +3,7 @@ from collections.abc import Iterator
 from datetime import datetime
 from enum import Enum
 from typing import Any
+from typing import Literal
 from typing import TYPE_CHECKING

 from pydantic import BaseModel
@@ -15,6 +16,8 @@ from onyx.context.search.enums import QueryFlow
 from onyx.context.search.enums import RecencyBiasSetting
 from onyx.context.search.enums import SearchType
 from onyx.context.search.models import RetrievalDocs
+from onyx.db.models import SearchDoc as DbSearchDoc
+from onyx.file_store.models import FileDescriptor
 from onyx.llm.override_models import PromptOverride
 from onyx.tools.models import ToolCallFinalResult
 from onyx.tools.models import ToolCallKickoff
@@ -40,8 +43,13 @@ class LlmDoc(BaseModel):
    match_highlights: list[str] | None


+class SubQuestionIdentifier(BaseModel):
+    """Optional (level, level_question_num) tag; both fields default to None."""
+
+    level: int | None = None
+    level_question_num: int | None = None
+
+
 # First chunk of info for streaming QA
-class QADocsResponse(RetrievalDocs):
+class QADocsResponse(RetrievalDocs, SubQuestionIdentifier):
    rephrased_query: str | None = None
    predicted_flow: QueryFlow | None
    predicted_search: SearchType | None
@@ -61,11 +69,20 @@ class QADocsResponse(RetrievalDocs):
 class StreamStopReason(Enum):
    CONTEXT_LENGTH = "context_length"
    CANCELLED = "cancelled"
+    FINISHED = "finished"


-class StreamStopInfo(BaseModel):
+class StreamType(Enum):
+    """Kind of stream a stop notification refers to."""
+
+    SUB_QUESTIONS = "sub_questions"
+    SUB_ANSWER = "sub_answer"
+    MAIN_ANSWER = "main_answer"
+
+
+class StreamStopInfo(SubQuestionIdentifier):
    stop_reason: StreamStopReason

+    stream_type: StreamType = StreamType.MAIN_ANSWER
+
    def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]:  # type: ignore
        data = super().model_dump(mode="json", *args, **kwargs)  # type: ignore
        data["stop_reason"] = self.stop_reason.name
@@ -105,7 +122,7 @@ class OnyxAnswerPiece(BaseModel):

 # An intermediate representation of citations, later translated into
 # a mapping of the citation [n] number to SearchDoc
-class CitationInfo(BaseModel):
+class CitationInfo(SubQuestionIdentifier):
    citation_num: int
    document_id: str

@@ -273,7 +290,7 @@ class AnswerStyleConfig(BaseModel):

 class PromptConfig(BaseModel):
    """Final representation of the Prompt configuration passed
-    into the `Answer` object."""
+    into the `PromptBuilder` object."""

    system_prompt: str
    task_prompt: str
@@ -299,6 +316,41 @@ class PromptConfig(BaseModel):
    model_config = ConfigDict(frozen=True)


+class SubQueryPiece(SubQuestionIdentifier):
+    """A sub-query string and its query id, tagged with a sub-question identifier."""
+
+    sub_query: str
+    query_id: int
+
+
+class AgentAnswerPiece(SubQuestionIdentifier):
+    """A piece of agent answer text, tagged with a sub-question identifier."""
+
+    answer_piece: str
+    # distinguishes sub-answer pieces from level-answer pieces
+    answer_type: Literal["agent_sub_answer", "agent_level_answer"]
+
+
+class SubQuestionPiece(SubQuestionIdentifier):
+    """A sub-question string, tagged with a sub-question identifier."""
+
+    sub_question: str
+
+
+class ExtendedToolResponse(ToolResponse, SubQuestionIdentifier):
+    """A ToolResponse that additionally carries the sub-question identifier fields."""
+
+    pass
+
+
+class RefinedAnswerImprovement(BaseModel):
+    """Packet carrying a single boolean flag."""
+
+    # NOTE(review): presumably True when the refined answer improved on the
+    # initial one - confirm against the producer of this packet
+    refined_answer_improvement: bool
+
+
+AgentSearchPacket = (
+    SubQuestionPiece
+    | AgentAnswerPiece
+    | SubQueryPiece
+    | ExtendedToolResponse
+    | RefinedAnswerImprovement
+)
+
+AnswerPacket = (
+    AnswerQuestionPossibleReturn | AgentSearchPacket | ToolCallKickoff | ToolResponse
+)
+
+
 ResponsePart = (
    OnyxAnswerPiece
    | CitationInfo
@@ -306,4 +358,33 @@ ResponsePart = (
    | ToolResponse
    | ToolCallFinalResult
    | StreamStopInfo
+    | AgentSearchPacket
 )
+
+AnswerStream = Iterator[AnswerPacket]
+
+
+class AnswerPostInfo(BaseModel):
+    """
+    Information collected for an answer alongside the streamed packets.
+
+    Only `ai_message_files` is required; every other field defaults to None.
+    """
+
+    ai_message_files: list[FileDescriptor]
+    qa_docs_response: QADocsResponse | None = None
+    reference_db_search_docs: list[DbSearchDoc] | None = None
+    dropped_indices: list[int] | None = None
+    tool_result: ToolCallFinalResult | None = None
+    message_specific_citations: MessageSpecificCitations | None = None
+
+    # Arbitrary types are allowed so that non-pydantic field types can be used
+    # (presumably needed for DbSearchDoc from onyx.db.models - confirm).
+    # Written as `model_config`, like PromptConfig in this module, instead of
+    # the deprecated pydantic v1 `class Config`.
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class SubQuestionKey(BaseModel):
+    """
+    Hashable (level, question_num) pair.
+
+    Equality and hashing are both based on the two fields, so instances can
+    be used as dictionary keys.
+    """
+
+    level: int
+    question_num: int
+
+    def __hash__(self) -> int:
+        key = (self.level, self.question_num)
+        return hash(key)
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, SubQuestionKey):
+            return False
+        return (self.level, self.question_num) == (other.level, other.question_num)
--- a/backend/onyx/chat/process_message.py
+++ b/backend/onyx/chat/process_message.py
@@ -1,4 +1,5 @@
 import traceback
+from collections import defaultdict
 from collections.abc import Callable
 from collections.abc import Iterator
 from functools import partial
@@ -9,13 +10,16 @@ from sqlalchemy.orm import Session
 from onyx.chat.answer import Answer
 from onyx.chat.chat_utils import create_chat_chain
 from onyx.chat.chat_utils import create_temporary_persona
+from onyx.chat.models import AgentSearchPacket
 from onyx.chat.models import AllCitations
+from onyx.chat.models import AnswerPostInfo
 from onyx.chat.models import AnswerStyleConfig
 from onyx.chat.models import ChatOnyxBotResponse
 from onyx.chat.models import CitationConfig
 from onyx.chat.models import CitationInfo
 from onyx.chat.models import CustomToolResponse
 from onyx.chat.models import DocumentPruningConfig
+from onyx.chat.models import ExtendedToolResponse
 from onyx.chat.models import FileChatDisplay
 from onyx.chat.models import FinalUsedContextDocsResponse
 from onyx.chat.models import LLMRelevanceFilterResponse
@@ -25,20 +29,32 @@ from onyx.chat.models import OnyxAnswerPiece
 from onyx.chat.models import OnyxContexts
 from onyx.chat.models import PromptConfig
 from onyx.chat.models import QADocsResponse
+from onyx.chat.models import RefinedAnswerImprovement
 from onyx.chat.models import StreamingError
 from onyx.chat.models import StreamStopInfo
+from onyx.chat.models import StreamStopReason
+from onyx.chat.models import SubQuestionKey
+from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
+from onyx.chat.prompt_builder.answer_prompt_builder import default_build_system_message
+from onyx.chat.prompt_builder.answer_prompt_builder import default_build_user_message
 from onyx.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
 from onyx.configs.chat_configs import DISABLE_LLM_CHOOSE_SEARCH
 from onyx.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
+from onyx.configs.constants import AGENT_SEARCH_INITIAL_KEY
+from onyx.configs.constants import BASIC_KEY
 from onyx.configs.constants import MessageType
 from onyx.configs.constants import MilestoneRecordType
 from onyx.configs.constants import NO_AUTH_USER_ID
+from onyx.context.search.enums import LLMEvaluationType
 from onyx.context.search.enums import OptionalSearchSetting
 from onyx.context.search.enums import QueryFlow
 from onyx.context.search.enums import SearchType
 from onyx.context.search.models import InferenceSection
 from onyx.context.search.models import RetrievalDetails
-from onyx.context.search.retrieval.search_runner import inference_sections_from_ids
+from onyx.context.search.models import SearchRequest
+from onyx.context.search.retrieval.search_runner import (
+    inference_sections_from_ids,
+)
 from onyx.context.search.utils import chunks_or_sections_to_search_docs
 from onyx.context.search.utils import dedupe_documents
 from onyx.context.search.utils import drop_llm_indices
@@ -127,7 +143,6 @@ from onyx.utils.timing import log_function_time
 from onyx.utils.timing import log_generator_function_time
 from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR

-
 logger = setup_logger()


@@ -159,12 +174,15 @@ def _handle_search_tool_response_summary(
 ) -> tuple[QADocsResponse, list[DbSearchDoc], list[int] | None]:
    response_sumary = cast(SearchResponseSummary, packet.response)

+    is_extended = isinstance(packet, ExtendedToolResponse)
    dropped_inds = None
    if not selected_search_docs:
        top_docs = chunks_or_sections_to_search_docs(response_sumary.top_sections)

        deduped_docs = top_docs
-        if dedupe_docs:
+        if (
+            dedupe_docs and not is_extended
+        ):  # Extended tool responses are already deduped
            deduped_docs, dropped_inds = dedupe_documents(top_docs)

        reference_db_search_docs = [
@@ -178,6 +196,10 @@ def _handle_search_tool_response_summary(
        translate_db_search_doc_to_server_search_doc(db_search_doc)
        for db_search_doc in reference_db_search_docs
    ]
+
+    level, question_num = None, None
+    if isinstance(packet, ExtendedToolResponse):
+        level, question_num = packet.level, packet.level_question_num
    return (
        QADocsResponse(
            rephrased_query=response_sumary.rephrased_query,
@@ -187,6 +209,8 @@ def _handle_search_tool_response_summary(
            applied_source_filters=response_sumary.final_filters.source_type,
            applied_time_cutoff=response_sumary.final_filters.time_cutoff,
            recency_bias_multiplier=response_sumary.recency_bias_multiplier,
+            level=level,
+            level_question_num=question_num,
        ),
        reference_db_search_docs,
        dropped_inds,
@@ -282,6 +306,7 @@ ChatPacket = (
    | MessageSpecificCitations
    | MessageResponseIDInfo
    | StreamStopInfo
+    | AgentSearchPacket
 )
 ChatPacketStream = Iterator[ChatPacket]

@@ -324,6 +349,7 @@ def stream_chat_message_objects(
    new_msg_req.chunks_above = 0
    new_msg_req.chunks_below = 0

+    llm = None
    try:
        user_id = user.id if user is not None else None

@@ -502,11 +528,8 @@ def stream_chat_message_objects(
        files = load_all_chat_files(
            history_msgs, new_msg_req.file_descriptors, db_session
        )
-        latest_query_files = [
-            file
-            for file in files
-            if file.file_id in [f["id"] for f in new_msg_req.file_descriptors]
-        ]
+        req_file_ids = [f["id"] for f in new_msg_req.file_descriptors]
+        latest_query_files = [file for file in files if file.file_id in req_file_ids]

        if user_message:
            attach_files_to_chat_message(
@@ -679,13 +702,58 @@ def stream_chat_message_objects(
        for tool_list in tool_dict.values():
            tools.extend(tool_list)

+        # TODO: unify message history with single message history
+        message_history = [
+            PreviousMessage.from_chat_message(msg, files) for msg in history_msgs
+        ]
+
+        search_request = SearchRequest(
+            query=final_msg.message,
+            evaluation_type=(
+                LLMEvaluationType.BASIC
+                if persona.llm_relevance_filter
+                else LLMEvaluationType.SKIP
+            ),
+            human_selected_filters=(
+                retrieval_options.filters if retrieval_options else None
+            ),
+            persona=persona,
+            offset=(retrieval_options.offset if retrieval_options else None),
+            limit=retrieval_options.limit if retrieval_options else None,
+            rerank_settings=new_msg_req.rerank_settings,
+            chunks_above=new_msg_req.chunks_above,
+            chunks_below=new_msg_req.chunks_below,
+            full_doc=new_msg_req.full_doc,
+            enable_auto_detect_filters=(
+                retrieval_options.enable_auto_detect_filters
+                if retrieval_options
+                else None
+            ),
+        )
+
+        force_use_tool = _get_force_search_settings(new_msg_req, tools)
+        prompt_builder = AnswerPromptBuilder(
+            user_message=default_build_user_message(
+                user_query=final_msg.message,
+                prompt_config=prompt_config,
+                files=latest_query_files,
+                single_message_history=single_message_history,
+            ),
+            system_message=default_build_system_message(prompt_config),
+            message_history=message_history,
+            llm_config=llm.config,
+            raw_user_query=final_msg.message,
+            raw_user_uploaded_files=latest_query_files or [],
+            single_message_history=single_message_history,
+        )
+        prompt_builder.update_system_prompt(default_build_system_message(prompt_config))
+
        # LLM prompt building, response capturing, etc.
        answer = Answer(
+            prompt_builder=prompt_builder,
            is_connected=is_connected,
-            question=final_msg.message,
            latest_query_files=latest_query_files,
            answer_style_config=answer_style_config,
-            prompt_config=prompt_config,
            llm=(
                llm
                or get_main_llm_from_tuple(
@@ -698,28 +766,46 @@ def stream_chat_message_objects(
                    )
                )
            ),
-            message_history=[
-                PreviousMessage.from_chat_message(msg, files) for msg in history_msgs
-            ],
+            fast_llm=fast_llm,
+            force_use_tool=force_use_tool,
+            search_request=search_request,
+            chat_session_id=chat_session_id,
+            current_agent_message_id=reserved_message_id,
            tools=tools,
-            force_use_tool=_get_force_search_settings(new_msg_req, tools),
-            single_message_history=single_message_history,
+            db_session=db_session,
+            use_agentic_search=new_msg_req.use_agentic_search,
        )

-        reference_db_search_docs = None
-        qa_docs_response = None
-        # any files to associate with the AI message e.g. dall-e generated images
-        ai_message_files = []
-        dropped_indices = None
-        tool_result = None
+        # reference_db_search_docs = None
+        # qa_docs_response = None
+        # # any files to associate with the AI message e.g. dall-e generated images
+        # ai_message_files = []
+        # dropped_indices = None
+        # tool_result = None

+        # TODO: different channels for stored info when it's coming from the agent flow
+        info_by_subq: dict[SubQuestionKey, AnswerPostInfo] = defaultdict(
+            lambda: AnswerPostInfo(ai_message_files=[])
+        )
+        refined_answer_improvement = True
        for packet in answer.processed_streamed_output:
            if isinstance(packet, ToolResponse):
+                level, level_question_num = (
+                    (packet.level, packet.level_question_num)
+                    if isinstance(packet, ExtendedToolResponse)
+                    else BASIC_KEY
+                )
+                assert level is not None
+                assert level_question_num is not None
+                info = info_by_subq[
+                    SubQuestionKey(level=level, question_num=level_question_num)
+                ]
+                # TODO: don't need to dedupe here when we do it in agent flow
                if packet.id == SEARCH_RESPONSE_SUMMARY_ID:
                    (
-                        qa_docs_response,
-                        reference_db_search_docs,
-                        dropped_indices,
+                        info.qa_docs_response,
+                        info.reference_db_search_docs,
+                        info.dropped_indices,
                    ) = _handle_search_tool_response_summary(
                        packet=packet,
                        db_session=db_session,
@@ -731,29 +817,34 @@ def stream_chat_message_objects(
                            else False
                        ),
                    )
-                    yield qa_docs_response
+                    yield info.qa_docs_response
                elif packet.id == SECTION_RELEVANCE_LIST_ID:
                    relevance_sections = packet.response

-                    if reference_db_search_docs is not None:
-                        llm_indices = relevant_sections_to_indices(
-                            relevance_sections=relevance_sections,
-                            items=[
-                                translate_db_search_doc_to_server_search_doc(doc)
-                                for doc in reference_db_search_docs
-                            ],
+                    if info.reference_db_search_docs is None:
+                        logger.warning(
+                            "No reference docs found for relevance filtering"
+                        )
+                        continue
+
+                    llm_indices = relevant_sections_to_indices(
+                        relevance_sections=relevance_sections,
+                        items=[
+                            translate_db_search_doc_to_server_search_doc(doc)
+                            for doc in info.reference_db_search_docs
+                        ],
+                    )
+
+                    if info.dropped_indices:
+                        llm_indices = drop_llm_indices(
+                            llm_indices=llm_indices,
+                            search_docs=info.reference_db_search_docs,
+                            dropped_indices=info.dropped_indices,
                        )

-                        if dropped_indices:
-                            llm_indices = drop_llm_indices(
-                                llm_indices=llm_indices,
-                                search_docs=reference_db_search_docs,
-                                dropped_indices=dropped_indices,
-                            )
-
-                        yield LLMRelevanceFilterResponse(
-                            llm_selected_doc_indices=llm_indices
-                        )
+                    yield LLMRelevanceFilterResponse(
+                        llm_selected_doc_indices=llm_indices
+                    )
                elif packet.id == FINAL_CONTEXT_DOCUMENTS_ID:
                    yield FinalUsedContextDocsResponse(
                        final_context_docs=packet.response
@@ -773,22 +864,24 @@ def stream_chat_message_objects(
                        ],
                        tenant_id=tenant_id,
                    )
-                    ai_message_files = [
-                        FileDescriptor(id=str(file_id), type=ChatFileType.IMAGE)
-                        for file_id in file_ids
-                    ]
+                    info.ai_message_files.extend(
+                        [
+                            FileDescriptor(id=str(file_id), type=ChatFileType.IMAGE)
+                            for file_id in file_ids
+                        ]
+                    )
                    yield FileChatDisplay(
                        file_ids=[str(file_id) for file_id in file_ids]
                    )
                elif packet.id == INTERNET_SEARCH_RESPONSE_ID:
                    (
-                        qa_docs_response,
-                        reference_db_search_docs,
+                        info.qa_docs_response,
+                        info.reference_db_search_docs,
                    ) = _handle_internet_search_tool_response_summary(
                        packet=packet,
                        db_session=db_session,
                    )
-                    yield qa_docs_response
+                    yield info.qa_docs_response
                elif packet.id == CUSTOM_TOOL_RESPONSE_ID:
                    custom_tool_response = cast(CustomToolCallSummary, packet.response)

@@ -797,7 +890,7 @@ def stream_chat_message_objects(
                        or custom_tool_response.response_type == "csv"
                    ):
                        file_ids = custom_tool_response.tool_result.file_ids
-                        ai_message_files.extend(
+                        info.ai_message_files.extend(
                            [
                                FileDescriptor(
                                    id=str(file_id),
@@ -822,10 +915,23 @@ def stream_chat_message_objects(
                    yield cast(OnyxContexts, packet.response)

            elif isinstance(packet, StreamStopInfo):
-                pass
+                if packet.stop_reason == StreamStopReason.FINISHED:
+                    yield packet
+            elif isinstance(packet, RefinedAnswerImprovement):
+                refined_answer_improvement = packet.refined_answer_improvement
+                yield packet
            else:
                if isinstance(packet, ToolCallFinalResult):
-                    tool_result = packet
+                    level, level_question_num = (
+                        (packet.level, packet.level_question_num)
+                        if packet.level is not None
+                        and packet.level_question_num is not None
+                        else BASIC_KEY
+                    )
+                    info = info_by_subq[
+                        SubQuestionKey(level=level, question_num=level_question_num)
+                    ]
+                    info.tool_result = packet
                yield cast(ChatPacket, packet)
        logger.debug("Reached end of stream")
    except ValueError as e:
@@ -841,59 +947,108 @@ def stream_chat_message_objects(

        error_msg = str(e)
        stack_trace = traceback.format_exc()
-        client_error_msg = litellm_exception_to_error_msg(e, llm)
-        if llm.config.api_key and len(llm.config.api_key) > 2:
-            error_msg = error_msg.replace(llm.config.api_key, "[REDACTED_API_KEY]")
-            stack_trace = stack_trace.replace(llm.config.api_key, "[REDACTED_API_KEY]")
+        if llm:
+            client_error_msg = litellm_exception_to_error_msg(e, llm)
+            if llm.config.api_key and len(llm.config.api_key) > 2:
+                error_msg = error_msg.replace(llm.config.api_key, "[REDACTED_API_KEY]")
+                stack_trace = stack_trace.replace(
+                    llm.config.api_key, "[REDACTED_API_KEY]"
+                )

-        yield StreamingError(error=client_error_msg, stack_trace=stack_trace)
+            yield StreamingError(error=client_error_msg, stack_trace=stack_trace)
        db_session.rollback()
        return

    # Post-LLM answer processing
    try:
-        logger.debug("Post-LLM answer processing")
-        message_specific_citations: MessageSpecificCitations | None = None
-        if reference_db_search_docs:
-            message_specific_citations = _translate_citations(
-                citations_list=answer.citations,
-                db_docs=reference_db_search_docs,
-            )
-            if not answer.is_cancelled():
-                yield AllCitations(citations=answer.citations)
-
-        # Saving Gen AI answer and responding with message info
        tool_name_to_tool_id: dict[str, int] = {}
        for tool_id, tool_list in tool_dict.items():
            for tool in tool_list:
                tool_name_to_tool_id[tool.name] = tool_id

+        subq_citations = answer.citations_by_subquestion()
+        for subq_key in subq_citations:
+            info = info_by_subq[subq_key]
+            logger.debug("Post-LLM answer processing")
+            if info.reference_db_search_docs:
+                info.message_specific_citations = _translate_citations(
+                    citations_list=subq_citations[subq_key],
+                    db_docs=info.reference_db_search_docs,
+                )
+
+            # TODO: AllCitations should contain subq info?
+            if not answer.is_cancelled():
+                yield AllCitations(citations=subq_citations[subq_key])
+
+        # Saving Gen AI answer and responding with message info
+
+        basic_key = SubQuestionKey(level=BASIC_KEY[0], question_num=BASIC_KEY[1])
+        info = (
+            info_by_subq[basic_key]
+            if basic_key in info_by_subq
+            else info_by_subq[
+                SubQuestionKey(
+                    level=AGENT_SEARCH_INITIAL_KEY[0],
+                    question_num=AGENT_SEARCH_INITIAL_KEY[1],
+                )
+            ]
+        )
        gen_ai_response_message = partial_response(
            message=answer.llm_answer,
            rephrased_query=(
-                qa_docs_response.rephrased_query if qa_docs_response else None
+                info.qa_docs_response.rephrased_query if info.qa_docs_response else None
            ),
-            reference_docs=reference_db_search_docs,
-            files=ai_message_files,
+            reference_docs=info.reference_db_search_docs,
+            files=info.ai_message_files,
            token_count=len(llm_tokenizer_encode_func(answer.llm_answer)),
            citations=(
-                message_specific_citations.citation_map
-                if message_specific_citations
+                info.message_specific_citations.citation_map
+                if info.message_specific_citations
                else None
            ),
            error=None,
            tool_call=(
                ToolCall(
-                    tool_id=tool_name_to_tool_id[tool_result.tool_name],
-                    tool_name=tool_result.tool_name,
-                    tool_arguments=tool_result.tool_args,
-                    tool_result=tool_result.tool_result,
+                    tool_id=tool_name_to_tool_id[info.tool_result.tool_name],
+                    tool_name=info.tool_result.tool_name,
+                    tool_arguments=info.tool_result.tool_args,
+                    tool_result=info.tool_result.tool_result,
                )
-                if tool_result
+                if info.tool_result
                else None
            ),
        )

+        # Add answers for levels >= 1, where each level has the previous as its parent. Use
+        # the llm_answer_by_level method in answer.py to get the answers for each level.
+        next_level = 1
+        prev_message = gen_ai_response_message
+        agent_answers = answer.llm_answer_by_level()
+        while next_level in agent_answers:
+            next_answer = agent_answers[next_level]
+            info = info_by_subq[
+                SubQuestionKey(
+                    level=next_level, question_num=AGENT_SEARCH_INITIAL_KEY[1]
+                )
+            ]
+            next_answer_message = create_new_chat_message(
+                chat_session_id=chat_session_id,
+                parent_message=prev_message,
+                message=next_answer,
+                prompt_id=None,
+                token_count=len(llm_tokenizer_encode_func(next_answer)),
+                message_type=MessageType.ASSISTANT,
+                db_session=db_session,
+                files=info.ai_message_files,
+                reference_docs=info.reference_db_search_docs,
+                citations=info.message_specific_citations.citation_map
+                if info.message_specific_citations
+                else None,
+                refined_answer_improvement=refined_answer_improvement,
+            )
+            next_level += 1
+            prev_message = next_answer_message
+
        logger.debug("Committing messages")
        db_session.commit()  # actually save user / assistant message

--- a/backend/onyx/chat/prompt_builder/answer_prompt_builder.py
+++ b/backend/onyx/chat/prompt_builder/answer_prompt_builder.py
@@ -4,6 +4,7 @@ from typing import cast
 from langchain_core.messages import BaseMessage
 from langchain_core.messages import HumanMessage
 from langchain_core.messages import SystemMessage
+from pydantic import BaseModel
 from pydantic.v1 import BaseModel as BaseModel__v1

 from onyx.chat.models import PromptConfig
@@ -84,6 +85,7 @@ class AnswerPromptBuilder:
        raw_user_query: str,
        raw_user_uploaded_files: list[InMemoryChatFile],
        single_message_history: str | None = None,
+        system_message: SystemMessage | None = None,
    ) -> None:
        self.max_tokens = compute_max_llm_input_tokens(llm_config)

@@ -108,7 +110,14 @@ class AnswerPromptBuilder:
            ),
        )

-        self.system_message_and_token_cnt: tuple[SystemMessage, int] | None = None
+        self.system_message_and_token_cnt: tuple[SystemMessage, int] | None = (
+            (
+                system_message,
+                check_message_tokens(system_message, self.llm_tokenizer_encode_func),
+            )
+            if system_message
+            else None
+        )
        self.user_message_and_token_cnt = (
            user_message,
            check_message_tokens(
@@ -174,6 +183,14 @@ class AnswerPromptBuilder:
        )


+# Subset of AnswerPromptBuilder state; accepted in its place by tool-call selection
+class PromptSnapshot(BaseModel):
+    raw_message_history: list[PreviousMessage]
+    raw_user_query: str
+    built_prompt: list[BaseMessage]
+
+
+# TODO: rename this? AnswerConfig maybe?
 class LLMCall(BaseModel__v1):
    prompt_builder: AnswerPromptBuilder
    tools: list[Tool]
--- a/backend/onyx/chat/stream_processing/answer_response_handler.py
+++ b/backend/onyx/chat/stream_processing/answer_response_handler.py
@@ -3,9 +3,10 @@ from collections.abc import Generator

 from langchain_core.messages import BaseMessage

-from onyx.chat.llm_response_handler import ResponsePart
 from onyx.chat.models import CitationInfo
 from onyx.chat.models import LlmDoc
+from onyx.chat.models import OnyxAnswerPiece
+from onyx.chat.models import ResponsePart
 from onyx.chat.stream_processing.citation_processing import CitationProcessor
 from onyx.chat.stream_processing.utils import DocumentIdOrderMapping
 from onyx.utils.logger import setup_logger
@@ -13,21 +14,32 @@ from onyx.utils.logger import setup_logger
 logger = setup_logger()


+# TODO: remove update() once it is no longer needed
 class AnswerResponseHandler(abc.ABC):
    @abc.abstractmethod
    def handle_response_part(
        self,
-        response_item: BaseMessage | None,
-        previous_response_items: list[BaseMessage],
+        response_item: BaseMessage | str | None,
+        previous_response_items: list[BaseMessage | str],
    ) -> Generator[ResponsePart, None, None]:
        raise NotImplementedError


+class PassThroughAnswerResponseHandler(AnswerResponseHandler):
+    def handle_response_part(
+        self,
+        response_item: BaseMessage | str | None,
+        previous_response_items: list[BaseMessage | str],
+    ) -> Generator[ResponsePart, None, None]:
+        # No citation handling: forward the text as-is (None gives an empty piece).
+        yield OnyxAnswerPiece(answer_piece=_message_to_str(response_item))
+
+
 class DummyAnswerResponseHandler(AnswerResponseHandler):
    def handle_response_part(
        self,
-        response_item: BaseMessage | None,
-        previous_response_items: list[BaseMessage],
+        response_item: BaseMessage | str | None,
+        previous_response_items: list[BaseMessage | str],
    ) -> Generator[ResponsePart, None, None]:
        # This is a dummy handler that returns nothing
        yield from []
@@ -56,43 +68,25 @@ class CitationResponseHandler(AnswerResponseHandler):

    def handle_response_part(
        self,
-        response_item: BaseMessage | None,
-        previous_response_items: list[BaseMessage],
+        response_item: BaseMessage | str | None,
+        previous_response_items: list[BaseMessage | str],
    ) -> Generator[ResponsePart, None, None]:
        if response_item is None:
            return

-        content = (
-            response_item.content if isinstance(response_item.content, str) else ""
-        )
+        content = _message_to_str(response_item)

        # Process the new content through the citation processor
        yield from self.citation_processor.process_token(content)


-# No longer in use, remove later
-# class QuotesResponseHandler(AnswerResponseHandler):
-#     def __init__(
-#         self,
-#         context_docs: list[LlmDoc],
-#         is_json_prompt: bool = True,
-#     ):
-#         self.quotes_processor = QuotesProcessor(
-#             context_docs=context_docs,
-#             is_json_prompt=is_json_prompt,
-#         )
-
-#     def handle_response_part(
-#         self,
-#         response_item: BaseMessage | None,
-#         previous_response_items: list[BaseMessage],
-#     ) -> Generator[ResponsePart, None, None]:
-#         if response_item is None:
-#             yield from self.quotes_processor.process_token(None)
-#             return
-
-#         content = (
-#             response_item.content if isinstance(response_item.content, str) else ""
-#         )
-
-#         yield from self.quotes_processor.process_token(content)
+def _message_to_str(message: BaseMessage | str | None) -> str:
+    # Flatten one streamed response item to plain text; None maps to "".
+    if message is None or isinstance(message, str):
+        return "" if message is None else message
+    # Defensive: anything that is not a BaseMessage is used as its own content.
+    text = message.content if isinstance(message, BaseMessage) else message
+    if isinstance(text, str):
+        return text
+    logger.warning(f"Received non-string content: {type(text)}")
+    return str(text) if text is not None else ""
--- a/backend/onyx/chat/tool_handling/tool_response_handler.py
+++ b/backend/onyx/chat/tool_handling/tool_response_handler.py
@@ -5,7 +5,9 @@ from langchain_core.messages import BaseMessage
 from langchain_core.messages import ToolCall

 from onyx.chat.models import ResponsePart
+from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
 from onyx.chat.prompt_builder.answer_prompt_builder import LLMCall
+from onyx.chat.prompt_builder.answer_prompt_builder import PromptSnapshot
 from onyx.llm.interfaces import LLM
 from onyx.tools.force import ForceUseTool
 from onyx.tools.message import build_tool_message
@@ -25,6 +27,13 @@ from onyx.utils.logger import setup_logger
 logger = setup_logger()


+def get_tool_by_name(tools: list[Tool], tool_name: str) -> Tool:
+    found = next((tool for tool in tools if tool.name == tool_name), None)
+    if found is None:  # first match wins; an unknown name is a hard error
+        raise RuntimeError(f"Tool '{tool_name}' not found")
+    return found
+
+
 class ToolResponseHandler:
    def __init__(self, tools: list[Tool]):
        self.tools = tools
@@ -43,67 +52,12 @@ class ToolResponseHandler:
    def get_tool_call_for_non_tool_calling_llm(
        cls, llm_call: LLMCall, llm: LLM
    ) -> tuple[Tool, dict] | None:
-        if llm_call.force_use_tool.force_use:
-            # if we are forcing a tool, we don't need to check which tools to run
-            tool = next(
-                (
-                    t
-                    for t in llm_call.tools
-                    if t.name == llm_call.force_use_tool.tool_name
-                ),
-                None,
-            )
-            if not tool:
-                raise RuntimeError(
-                    f"Tool '{llm_call.force_use_tool.tool_name}' not found"
-                )
-
-            tool_args = (
-                llm_call.force_use_tool.args
-                if llm_call.force_use_tool.args is not None
-                else tool.get_args_for_non_tool_calling_llm(
-                    query=llm_call.prompt_builder.raw_user_query,
-                    history=llm_call.prompt_builder.raw_message_history,
-                    llm=llm,
-                    force_run=True,
-                )
-            )
-
-            if tool_args is None:
-                raise RuntimeError(f"Tool '{tool.name}' did not return args")
-
-            return (tool, tool_args)
-        else:
-            tool_options = check_which_tools_should_run_for_non_tool_calling_llm(
-                tools=llm_call.tools,
-                query=llm_call.prompt_builder.raw_user_query,
-                history=llm_call.prompt_builder.raw_message_history,
-                llm=llm,
-            )
-
-            available_tools_and_args = [
-                (llm_call.tools[ind], args)
-                for ind, args in enumerate(tool_options)
-                if args is not None
-            ]
-
-            logger.info(
-                f"Selecting single tool from tools: {[(tool.name, args) for tool, args in available_tools_and_args]}"
-            )
-
-            chosen_tool_and_args = (
-                select_single_tool_for_non_tool_calling_llm(
-                    tools_and_args=available_tools_and_args,
-                    history=llm_call.prompt_builder.raw_message_history,
-                    query=llm_call.prompt_builder.raw_user_query,
-                    llm=llm,
-                )
-                if available_tools_and_args
-                else None
-            )
-
-            logger.notice(f"Chosen tool: {chosen_tool_and_args}")
-            return chosen_tool_and_args
+        return get_tool_call_for_non_tool_calling_llm_impl(
+            force_use_tool=llm_call.force_use_tool,
+            tools=llm_call.tools,
+            prompt_builder=llm_call.prompt_builder,
+            llm=llm,
+        )

    def _handle_tool_call(self) -> Generator[ResponsePart, None, None]:
        if not self.tool_call_chunk or not self.tool_call_chunk.tool_calls:
@@ -118,20 +72,17 @@ class ToolResponseHandler:
                tool for tool in self.tools if tool.name == tool_call_request["name"]
            ]

-            if not known_tools_by_name:
-                logger.error(
-                    "Tool call requested with unknown name field. \n"
-                    f"self.tools: {self.tools}"
-                    f"tool_call_request: {tool_call_request}"
-                )
-                continue
-            else:
+            if known_tools_by_name:
                selected_tool = known_tools_by_name[0]
                selected_tool_call_request = tool_call_request
-
-            if selected_tool and selected_tool_call_request:
                break

+            logger.error(
+                "Tool call requested with unknown name field. \n"
+                f"self.tools: {self.tools}"
+                f"tool_call_request: {tool_call_request}"
+            )
+
        if not selected_tool or not selected_tool_call_request:
            return

@@ -157,8 +108,8 @@ class ToolResponseHandler:

    def handle_response_part(
        self,
-        response_item: BaseMessage | None,
-        previous_response_items: list[BaseMessage],
+        response_item: BaseMessage | str | None,
+        previous_response_items: list[BaseMessage | str],
    ) -> Generator[ResponsePart, None, None]:
        if response_item is None:
            yield from self._handle_tool_call()
@@ -171,8 +122,6 @@ class ToolResponseHandler:
            else:
                self.tool_call_chunk += response_item  # type: ignore

-        return
-
    def next_llm_call(self, current_llm_call: LLMCall) -> LLMCall | None:
        if (
            self.tool_runner is None
@@ -205,3 +154,61 @@ class ToolResponseHandler:
                self.tool_final_result,
            ],
        )
+
+
+def get_tool_call_for_non_tool_calling_llm_impl(
+    force_use_tool: ForceUseTool,
+    tools: list[Tool],
+    prompt_builder: AnswerPromptBuilder | PromptSnapshot,
+    llm: LLM,
+) -> tuple[Tool, dict] | None:
+    """Choose the tool and arguments to run for an LLM without native tool calling.
+
+    A forced tool is returned with the supplied args, or with args requested from
+    the tool itself; RuntimeError is raised if that yields no args. Otherwise each
+    tool is checked, one candidate is selected among those that returned args, and
+    None is returned when there are no candidates.
+    """
+    if force_use_tool.force_use:
+        # if we are forcing a tool, we don't need to check which tools to run
+        forced_tool = get_tool_by_name(tools, force_use_tool.tool_name)
+        forced_args = force_use_tool.args
+        if forced_args is None:
+            forced_args = forced_tool.get_args_for_non_tool_calling_llm(
+                query=prompt_builder.raw_user_query,
+                history=prompt_builder.raw_message_history,
+                llm=llm,
+                force_run=True,
+            )
+
+        if forced_args is None:
+            raise RuntimeError(f"Tool '{forced_tool.name}' did not return args")
+
+        return (forced_tool, forced_args)
+
+    per_tool_args = check_which_tools_should_run_for_non_tool_calling_llm(
+        tools=tools,
+        query=prompt_builder.raw_user_query,
+        history=prompt_builder.raw_message_history,
+        llm=llm,
+    )
+
+    candidates = [
+        (tools[idx], args) for idx, args in enumerate(per_tool_args) if args is not None
+    ]
+
+    logger.info(
+        f"Selecting single tool from tools: {[(tool.name, args) for tool, args in candidates]}"
+    )
+
+    chosen_tool_and_args: tuple[Tool, dict] | None = None
+    if candidates:
+        chosen_tool_and_args = select_single_tool_for_non_tool_calling_llm(
+            tools_and_args=candidates,
+            history=prompt_builder.raw_message_history,
+            query=prompt_builder.raw_user_query,
+            llm=llm,
+        )
+
+    logger.notice(f"Chosen tool: {chosen_tool_and_args}")
+    return chosen_tool_and_args
--- a/backend/onyx/configs/agent_configs.py
+++ b/backend/onyx/configs/agent_configs.py
@@ -0,0 +1,80 @@
+"""Agent search settings.
+
+Most values can be overridden through environment variables of the same name.
+"""
+
+import os
+
+INITIAL_SEARCH_DECOMPOSITION_ENABLED = True
+ALLOW_REFINEMENT = True
+
+AGENT_DEFAULT_RETRIEVAL_HITS = 15
+AGENT_DEFAULT_RERANKING_HITS = 10
+AGENT_DEFAULT_SUB_QUESTION_MAX_CONTEXT_HITS = 8
+AGENT_DEFAULT_NUM_DOCS_FOR_INITIAL_DECOMPOSITION = 3
+AGENT_DEFAULT_NUM_DOCS_FOR_REFINED_DECOMPOSITION = 5
+AGENT_DEFAULT_EXPLORATORY_SEARCH_RESULTS = 5
+AGENT_DEFAULT_MIN_ORIG_QUESTION_DOCS = 3
+AGENT_DEFAULT_MAX_ANSWER_CONTEXT_DOCS = 10
+AGENT_DEFAULT_MAX_STATIC_HISTORY_WORD_LENGTH = 2000
+
+#####
+# Agent Configs
+#####
+
+# Enabled unless AGENT_RETRIEVAL_STATS is set to "false" (case-insensitive).
+# Compare the env string directly: appending `or True` would pin the flag to True.
+AGENT_RETRIEVAL_STATS = (
+    os.environ.get("AGENT_RETRIEVAL_STATS", "").lower() != "false"
+)  # default True
+
+AGENT_MAX_QUERY_RETRIEVAL_RESULTS = int(
+    os.environ.get("AGENT_MAX_QUERY_RETRIEVAL_RESULTS") or AGENT_DEFAULT_RETRIEVAL_HITS
+)  # 15
+
+# Reranking agent configs
+# Reranking stats - no influence on flow outside of stats collection
+# Enabled only when AGENT_RERANKING_STATS is set to "true" (case-insensitive);
+# negating the comparison would invert the flag and make the default True.
+AGENT_RERANKING_STATS = (
+    os.environ.get("AGENT_RERANKING_STATS", "").lower() == "true"
+)  # default False
+
+AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS = int(
+    os.environ.get("AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS")
+    or AGENT_DEFAULT_RERANKING_HITS
+)  # 10
+
+AGENT_NUM_DOCS_FOR_DECOMPOSITION = int(
+    os.environ.get("AGENT_NUM_DOCS_FOR_DECOMPOSITION")
+    or AGENT_DEFAULT_NUM_DOCS_FOR_INITIAL_DECOMPOSITION
+)  # 3
+
+AGENT_NUM_DOCS_FOR_REFINED_DECOMPOSITION = int(
+    os.environ.get("AGENT_NUM_DOCS_FOR_REFINED_DECOMPOSITION")
+    or AGENT_DEFAULT_NUM_DOCS_FOR_REFINED_DECOMPOSITION
+)  # 5
+
+AGENT_EXPLORATORY_SEARCH_RESULTS = int(
+    os.environ.get("AGENT_EXPLORATORY_SEARCH_RESULTS")
+    or AGENT_DEFAULT_EXPLORATORY_SEARCH_RESULTS
+)  # 5
+
+AGENT_MIN_ORIG_QUESTION_DOCS = int(
+    os.environ.get("AGENT_MIN_ORIG_QUESTION_DOCS")
+    or AGENT_DEFAULT_MIN_ORIG_QUESTION_DOCS
+)  # 3
+
+# NOTE(review): falls back to AGENT_DEFAULT_SUB_QUESTION_MAX_CONTEXT_HITS (8), not
+# AGENT_DEFAULT_MAX_ANSWER_CONTEXT_DOCS (10), which is otherwise unused - confirm.
+AGENT_MAX_ANSWER_CONTEXT_DOCS = int(
+    os.environ.get("AGENT_MAX_ANSWER_CONTEXT_DOCS")
+    or AGENT_DEFAULT_SUB_QUESTION_MAX_CONTEXT_HITS
+)  # 8
+
+AGENT_MAX_STATIC_HISTORY_WORD_LENGTH = int(
+    os.environ.get("AGENT_MAX_STATIC_HISTORY_WORD_LENGTH")
+    or AGENT_DEFAULT_MAX_STATIC_HISTORY_WORD_LENGTH
+)  # 2000
+
+GRAPH_VERSION_NAME: str = "a"
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -38,6 +38,13 @@ DEFAULT_PERSONA_ID = 0

 DEFAULT_CC_PAIR_ID = 1

+# subquestion level and question number for basic flow
+BASIC_KEY = (-1, -1)
+AGENT_SEARCH_INITIAL_KEY = (0, 0)
+CANCEL_CHECK_INTERVAL = 20
+DISPATCH_SEP_CHAR = "\n"
+FORMAT_DOCS_SEPARATOR = "\n\n"
+NUM_EXPLORATORY_DOCS = 15
 # Postgres connection constants for application_name
 POSTGRES_WEB_APP_NAME = "web"
 POSTGRES_INDEXER_APP_NAME = "indexer"
--- a/backend/onyx/context/search/pipeline.py
+++ b/backend/onyx/context/search/pipeline.py
@@ -24,7 +24,9 @@ from onyx.context.search.models import SearchRequest
 from onyx.context.search.postprocessing.postprocessing import cleanup_chunks
 from onyx.context.search.postprocessing.postprocessing import search_postprocessing
 from onyx.context.search.preprocessing.preprocessing import retrieval_preprocessing
-from onyx.context.search.retrieval.search_runner import retrieve_chunks
+from onyx.context.search.retrieval.search_runner import (
+    retrieve_chunks,
+)
 from onyx.context.search.utils import inference_section_from_chunks
 from onyx.context.search.utils import relevant_sections_to_indices
 from onyx.db.models import User
@@ -54,6 +56,8 @@ class SearchPipeline:
        retrieval_metrics_callback: (
            Callable[[RetrievalMetricsContainer], None] | None
        ) = None,
+        retrieved_sections_callback: Callable[[list[InferenceSection]], None]
+        | None = None,
        rerank_metrics_callback: Callable[[RerankMetricsContainer], None] | None = None,
        prompt_config: PromptConfig | None = None,
    ):
@@ -78,6 +82,8 @@ class SearchPipeline:
        self._retrieved_chunks: list[InferenceChunk] | None = None
        # Another call made to the document index to get surrounding sections
        self._retrieved_sections: list[InferenceSection] | None = None
+
+        self.retrieved_sections_callback = retrieved_sections_callback
        # Reranking and LLM section selection can be run together
        # If only LLM selection is on, the reranked chunks are yielded immediatly
        self._reranked_sections: list[InferenceSection] | None = None
@@ -326,9 +332,13 @@ class SearchPipeline:
        if self._reranked_sections is not None:
            return self._reranked_sections

+        retrieved_sections = self._get_sections()
+        if self.retrieved_sections_callback is not None:
+            self.retrieved_sections_callback(retrieved_sections)
+
        self._postprocessing_generator = search_postprocessing(
            search_query=self.search_query,
-            retrieved_sections=self._get_sections(),
+            retrieved_sections=retrieved_sections,
            llm=self.fast_llm,
            rerank_metrics_callback=self.rerank_metrics_callback,
        )
@@ -403,8 +413,18 @@ class SearchPipeline:

    @property
    def section_relevance_list(self) -> list[bool]:
-        llm_indices = relevant_sections_to_indices(
-            relevance_sections=self.section_relevance,
-            items=self.final_context_sections,
+        return section_relevance_list_impl(
+            section_relevance=self.section_relevance,
+            final_context_sections=self.final_context_sections,
        )
-        return [ind in llm_indices for ind in range(len(self.final_context_sections))]
+
+
+def section_relevance_list_impl(
+    section_relevance: list[SectionRelevancePiece] | None,
+    final_context_sections: list[InferenceSection],
+) -> list[bool]:
+    """Return, per final context section, whether its index was reported relevant."""
+    relevant_indices = relevant_sections_to_indices(
+        relevance_sections=section_relevance, items=final_context_sections
+    )
+    return [idx in relevant_indices for idx, _ in enumerate(final_context_sections)]
--- a/backend/onyx/context/search/utils.py
+++ b/backend/onyx/context/search/utils.py
@@ -80,7 +80,7 @@ def drop_llm_indices(
    search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
    dropped_indices: list[int],
 ) -> list[int]:
-    llm_bools = [True if i in llm_indices else False for i in range(len(search_docs))]
+    llm_bools = [i in llm_indices for i in range(len(search_docs))]
    if dropped_indices:
        llm_bools = [
            val for ind, val in enumerate(llm_bools) if ind not in dropped_indices
--- a/backend/onyx/db/chat.py
+++ b/backend/onyx/db/chat.py
@@ -1,6 +1,8 @@
 from collections.abc import Sequence
 from datetime import datetime
 from datetime import timedelta
+from typing import Any
+from typing import cast
 from uuid import UUID

 from fastapi import HTTPException
@@ -15,13 +17,22 @@ from sqlalchemy.exc import MultipleResultsFound
 from sqlalchemy.orm import joinedload
 from sqlalchemy.orm import Session

+from onyx.agents.agent_search.shared_graph_utils.models import CombinedAgentMetrics
+from onyx.agents.agent_search.shared_graph_utils.models import (
+    SubQuestionAnswerResults,
+)
 from onyx.auth.schemas import UserRole
 from onyx.chat.models import DocumentRelevance
 from onyx.configs.chat_configs import HARD_DELETE_CHATS
 from onyx.configs.constants import MessageType
+from onyx.context.search.models import InferenceSection
 from onyx.context.search.models import RetrievalDocs
 from onyx.context.search.models import SavedSearchDoc
 from onyx.context.search.models import SearchDoc as ServerSearchDoc
+from onyx.context.search.utils import chunks_or_sections_to_search_docs
+from onyx.db.models import AgentSearchMetrics
+from onyx.db.models import AgentSubQuery
+from onyx.db.models import AgentSubQuestion
 from onyx.db.models import ChatMessage
 from onyx.db.models import ChatMessage__SearchDoc
 from onyx.db.models import ChatSession
@@ -37,9 +48,11 @@ from onyx.file_store.models import FileDescriptor
 from onyx.llm.override_models import LLMOverride
 from onyx.llm.override_models import PromptOverride
 from onyx.server.query_and_chat.models import ChatMessageDetail
+from onyx.server.query_and_chat.models import SubQueryDetail
+from onyx.server.query_and_chat.models import SubQuestionDetail
 from onyx.tools.tool_runner import ToolCallFinalResult
 from onyx.utils.logger import setup_logger
-
+from onyx.utils.special_types import JSON_ro

 logger = setup_logger()

@@ -496,6 +509,7 @@ def get_chat_messages_by_session(
    prefetch_tool_calls: bool = False,
 ) -> list[ChatMessage]:
    if not skip_permission_check:
+        # bug if we ever call this expecting the permission check to not be skipped
        get_chat_session_by_id(
            chat_session_id=chat_session_id, user_id=user_id, db_session=db_session
        )
@@ -507,7 +521,12 @@ def get_chat_messages_by_session(
    )

    if prefetch_tool_calls:
-        stmt = stmt.options(joinedload(ChatMessage.tool_call))
+        stmt = stmt.options(
+            joinedload(ChatMessage.tool_call),
+            joinedload(ChatMessage.sub_questions).joinedload(
+                AgentSubQuestion.sub_queries
+            ),
+        )
        result = db_session.scalars(stmt).unique().all()
    else:
        result = db_session.scalars(stmt).all()
@@ -597,6 +616,7 @@ def create_new_chat_message(
    commit: bool = True,
    reserved_message_id: int | None = None,
    overridden_model: str | None = None,
+    refined_answer_improvement: bool = True,
 ) -> ChatMessage:
    if reserved_message_id is not None:
        # Edit existing message
@@ -617,6 +637,7 @@ def create_new_chat_message(
        existing_message.error = error
        existing_message.alternate_assistant_id = alternate_assistant_id
        existing_message.overridden_model = overridden_model
+        existing_message.refined_answer_improvement = refined_answer_improvement

        new_chat_message = existing_message
    else:
@@ -636,6 +657,7 @@ def create_new_chat_message(
            error=error,
            alternate_assistant_id=alternate_assistant_id,
            overridden_model=overridden_model,
+            refined_answer_improvement=refined_answer_improvement,
        )
        db_session.add(new_chat_message)

@@ -837,14 +859,54 @@ def translate_db_search_doc_to_server_search_doc(
    )


-def get_retrieval_docs_from_chat_message(
-    chat_message: ChatMessage, remove_doc_content: bool = False
+def translate_db_sub_questions_to_server_objects(
+    db_sub_questions: list[AgentSubQuestion],
+) -> list[SubQuestionDetail]:
+    """Convert stored agent sub-questions into SubQuestionDetail server objects."""
+    details: list[SubQuestionDetail] = []
+    for db_sub_question in db_sub_questions:
+        stored_results = cast(
+            list[dict[str, JSON_ro]], db_sub_question.sub_question_doc_results
+        )
+        verified_doc_ids = [result["document_id"] for result in stored_results]
+        query_details: list[SubQueryDetail] = []
+        # Keyed by document_id; a later sub-query's doc overwrites an earlier one.
+        docs_by_document_id: dict[str, SearchDoc] = {}
+        for db_sub_query in db_sub_question.sub_queries:
+            query_docs = db_sub_query.search_docs
+            query_details.append(
+                SubQueryDetail(
+                    query=db_sub_query.sub_query,
+                    query_id=db_sub_query.id,
+                    doc_ids=[query_doc.id for query_doc in query_docs],
+                )
+            )
+            docs_by_document_id.update({d.document_id: d for d in query_docs})
+        verified_docs = [
+            docs_by_document_id[cast(str, doc_id)]
+            for doc_id in verified_doc_ids
+            if doc_id in docs_by_document_id
+        ]
+        details.append(
+            SubQuestionDetail(
+                level=db_sub_question.level,
+                level_question_num=db_sub_question.level_question_num,
+                question=db_sub_question.sub_question,
+                answer=db_sub_question.sub_answer,
+                sub_queries=query_details,
+                context_docs=get_retrieval_docs_from_search_docs(verified_docs),
+            )
+        )
+    return details
+
+
+def get_retrieval_docs_from_search_docs(
+    search_docs: list[SearchDoc], remove_doc_content: bool = False
 ) -> RetrievalDocs:
    top_documents = [
        translate_db_search_doc_to_server_search_doc(
            db_doc, remove_doc_content=remove_doc_content
        )
-        for db_doc in chat_message.search_docs
+        for db_doc in search_docs
    ]
    top_documents = sorted(top_documents, key=lambda doc: doc.score, reverse=True)  # type: ignore
    return RetrievalDocs(top_documents=top_documents)
@@ -861,8 +923,8 @@ def translate_db_message_to_chat_message_detail(
        latest_child_message=chat_message.latest_child_message,
        message=chat_message.message,
        rephrased_query=chat_message.rephrased_query,
-        context_docs=get_retrieval_docs_from_chat_message(
-            chat_message, remove_doc_content=remove_doc_content
+        context_docs=get_retrieval_docs_from_search_docs(
+            chat_message.search_docs, remove_doc_content=remove_doc_content
        ),
        message_type=chat_message.message_type,
        time_sent=chat_message.time_sent,
@@ -877,6 +939,121 @@ def translate_db_message_to_chat_message_detail(
        else None,
        alternate_assistant_id=chat_message.alternate_assistant_id,
        overridden_model=chat_message.overridden_model,
+        sub_questions=translate_db_sub_questions_to_server_objects(
+            chat_message.sub_questions
+        ),
+        refined_answer_improvement=chat_message.refined_answer_improvement,
    )

    return chat_msg_detail
+
+
+def log_agent_metrics(
+    db_session: Session,
+    user_id: UUID | None,
+    persona_id: int | None,  # Can be none if temporary persona is used
+    agent_type: str,
+    start_time: datetime | None,
+    agent_metrics: CombinedAgentMetrics,
+) -> AgentSearchMetrics:
+    """Add an AgentSearchMetrics row built from `agent_metrics` and flush (not commit).
+
+    Each optional metrics group is stored as its `vars()` dict, or None when falsy.
+    """
+    timings = agent_metrics.timings
+    base = agent_metrics.base_metrics
+    refined = agent_metrics.refined_metrics
+    additional = agent_metrics.additional_metrics
+
+    metrics_row = AgentSearchMetrics(
+        user_id=user_id,
+        persona_id=persona_id,
+        agent_type=agent_type,
+        start_time=start_time,
+        base_duration_s=timings.base_duration_s,
+        full_duration_s=timings.full_duration_s,
+        base_metrics=vars(base) if base else None,
+        refined_metrics=vars(refined) if refined else None,
+        all_metrics=vars(additional) if additional else None,
+    )
+    db_session.add(metrics_row)
+    db_session.flush()
+    return metrics_row
+
+
+def log_agent_sub_question_results(
+    db_session: Session,
+    chat_session_id: UUID | None,
+    primary_message_id: int | None,
+    sub_question_answer_results: list[SubQuestionAnswerResults],
+) -> None:
+    """Persist sub-questions, their sub-queries and retrieved docs for a message.
+
+    Each result's `question_id` must have the form "<level>_<question_num>".
+    Rows are committed as they are created, so a failure part-way through
+    leaves the earlier rows in the database.
+    """
+    from datetime import timezone
+
+    def _create_citation_format_list(
+        document_citations: list[InferenceSection],
+    ) -> list[dict[str, Any]]:
+        # One dict per section, built from that section's center chunk.
+        return [
+            {
+                "link": "",
+                "blurb": section.center_chunk.blurb,
+                "content": section.center_chunk.content,
+                "metadata": section.center_chunk.metadata,
+                "updated_at": str(section.center_chunk.updated_at),
+                "document_id": section.center_chunk.document_id,
+                "source_type": "file",
+                "source_links": section.center_chunk.source_links,
+                "match_highlights": section.center_chunk.match_highlights,
+                "semantic_identifier": section.center_chunk.semantic_identifier,
+            }
+            for section in document_citations
+        ]
+
+    # Timezone-aware, so the stored instant does not depend on how the database
+    # session would interpret a naive local timestamp.
+    now = datetime.now(timezone.utc)
+
+    for result in sub_question_answer_results:
+        level, level_question_num = [
+            int(part) for part in result.question_id.split("_")
+        ]
+        sub_question_object = AgentSubQuestion(
+            chat_session_id=chat_session_id,
+            primary_question_id=primary_message_id,
+            level=level,
+            level_question_num=level_question_num,
+            sub_question=result.question,
+            sub_answer=result.answer,
+            sub_question_doc_results=_create_citation_format_list(
+                result.verified_reranked_documents
+            ),
+        )
+        db_session.add(sub_question_object)
+        # Written before the sub-queries so its id can be used as their parent id.
+        db_session.commit()
+        sub_question_id = sub_question_object.id
+
+        for sub_query in result.sub_query_retrieval_results:
+            sub_query_object = AgentSubQuery(
+                parent_question_id=sub_question_id,
+                chat_session_id=chat_session_id,
+                sub_query=sub_query.query,
+                time_created=now,
+            )
+            db_session.add(sub_query_object)
+            db_session.commit()
+
+            search_docs = chunks_or_sections_to_search_docs(
+                sub_query.retrieved_documents
+            )
+            for doc in search_docs:
+                db_doc = create_db_search_doc(doc, db_session)
+                db_session.add(db_doc)
+                sub_query_object.search_docs.append(db_doc)
+            db_session.commit()
--- a/backend/onyx/db/models.py
+++ b/backend/onyx/db/models.py
@@ -325,6 +325,17 @@ class ChatMessage__SearchDoc(Base):
    )


+class AgentSubQuery__SearchDoc(Base):
+    __tablename__ = "agent__sub_query__search_doc"
+    # Association table: each row links one agent sub-query to one search doc.
+    sub_query_id: Mapped[int] = mapped_column(
+        ForeignKey("agent__sub_query.id"), primary_key=True
+    )
+    search_doc_id: Mapped[int] = mapped_column(
+        ForeignKey("search_doc.id"), primary_key=True
+    )
+
+
 class Document__Tag(Base):
    __tablename__ = "document__tag"

@@ -1048,6 +1059,11 @@ class SearchDoc(Base):
        secondary=ChatMessage__SearchDoc.__table__,
        back_populates="search_docs",
    )
+    sub_queries = relationship(
+        "AgentSubQuery",
+        secondary=AgentSubQuery__SearchDoc.__table__,
+        back_populates="search_docs",
+    )


 class ToolCall(Base):
@@ -1188,6 +1204,8 @@ class ChatMessage(Base):
        DateTime(timezone=True), server_default=func.now()
    )

+    refined_answer_improvement: Mapped[bool] = mapped_column(Boolean, nullable=True)
+
    chat_session: Mapped[ChatSession] = relationship("ChatSession")
    prompt: Mapped[Optional["Prompt"]] = relationship("Prompt")

@@ -1214,6 +1232,11 @@ class ChatMessage(Base):
        uselist=False,
    )

+    sub_questions: Mapped[list["AgentSubQuestion"]] = relationship(
+        "AgentSubQuestion",
+        back_populates="primary_message",
+    )
+
    standard_answers: Mapped[list["StandardAnswer"]] = relationship(
        "StandardAnswer",
        secondary=ChatMessage__StandardAnswer.__table__,
@@ -1248,6 +1271,71 @@ class ChatFolder(Base):
        return self.display_priority < other.display_priority


+class AgentSubQuestion(Base):
+    """
+    A sub-question is a question that is asked of the LLM to gather supporting
+    information to answer a primary question (the ChatMessage in primary_question_id).
+    """
+
+    __tablename__ = "agent__sub_question"
+
+    id: Mapped[int] = mapped_column(primary_key=True)
+    primary_question_id: Mapped[int] = mapped_column(ForeignKey("chat_message.id"))
+    chat_session_id: Mapped[UUID] = mapped_column(
+        PGUUID(as_uuid=True), ForeignKey("chat_session.id")
+    )
+    sub_question: Mapped[str] = mapped_column(Text)
+    level: Mapped[int] = mapped_column(Integer)
+    level_question_num: Mapped[int] = mapped_column(Integer)
+    time_created: Mapped[datetime.datetime] = mapped_column(
+        DateTime(timezone=True), server_default=func.now()
+    )
+    sub_answer: Mapped[str] = mapped_column(Text)
+    sub_question_doc_results: Mapped[JSON_ro] = mapped_column(postgresql.JSONB())
+
+    # Relationships (sub_queries is the reverse side of AgentSubQuery.parent_question)
+    primary_message: Mapped["ChatMessage"] = relationship(
+        "ChatMessage",
+        foreign_keys=[primary_question_id],
+        back_populates="sub_questions",
+    )
+    chat_session: Mapped["ChatSession"] = relationship("ChatSession")
+    sub_queries: Mapped[list["AgentSubQuery"]] = relationship(
+        "AgentSubQuery", back_populates="parent_question"
+    )
+
+
+class AgentSubQuery(Base):
+    """
+    A sub-query is a vector DB query that gathers supporting information to answer a sub-question.
+    """
+
+    __tablename__ = "agent__sub_query"
+
+    id: Mapped[int] = mapped_column(primary_key=True)
+    parent_question_id: Mapped[int] = mapped_column(
+        ForeignKey("agent__sub_question.id")
+    )
+    chat_session_id: Mapped[UUID] = mapped_column(
+        PGUUID(as_uuid=True), ForeignKey("chat_session.id")
+    )
+    sub_query: Mapped[str] = mapped_column(Text)
+    time_created: Mapped[datetime.datetime] = mapped_column(
+        DateTime(timezone=True), server_default=func.now()
+    )
+
+    # Relationships (search_docs is many-to-many via AgentSubQuery__SearchDoc)
+    parent_question: Mapped["AgentSubQuestion"] = relationship(
+        "AgentSubQuestion", back_populates="sub_queries"
+    )
+    chat_session: Mapped["ChatSession"] = relationship("ChatSession")
+    search_docs: Mapped[list["SearchDoc"]] = relationship(
+        "SearchDoc",
+        secondary=AgentSubQuery__SearchDoc.__table__,
+        back_populates="sub_queries",
+    )
+
+
 """
 Feedback, Logging, Metrics Tables
 """
@@ -1751,6 +1839,25 @@ class PGFileStore(Base):
    lobj_oid: Mapped[int] = mapped_column(Integer, nullable=False)


+class AgentSearchMetrics(Base):
+    __tablename__ = "agent__search_metrics"
+    # Timing and metrics record for agent search; rows are built in log_agent_metrics.
+    id: Mapped[int] = mapped_column(primary_key=True)
+    user_id: Mapped[UUID | None] = mapped_column(
+        ForeignKey("user.id", ondelete="CASCADE"), nullable=True
+    )
+    persona_id: Mapped[int | None] = mapped_column(
+        ForeignKey("persona.id"), nullable=True
+    )
+    agent_type: Mapped[str] = mapped_column(String)
+    start_time: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
+    base_duration_s: Mapped[float] = mapped_column(Float)
+    full_duration_s: Mapped[float] = mapped_column(Float)
+    base_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
+    refined_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
+    all_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
+
+
 """
 ************************************************************************
 Enterprise Edition Models
--- a/backend/onyx/llm/chat_llm.py
+++ b/backend/onyx/llm/chat_llm.py
@@ -404,7 +404,7 @@ class DefaultMultiLLM(LLM):
                # streaming choice
                stream=stream,
                # model params
-                temperature=self._temperature,
+                temperature=0,
                timeout=self._timeout,
                # For now, we don't support parallel tool calls
                # NOTE: we can't pass this in if tools are not specified
--- a/backend/onyx/llm/utils.py
+++ b/backend/onyx/llm/utils.py
@@ -440,14 +440,14 @@ def get_llm_max_tokens(

        if "max_input_tokens" in model_obj:
            max_tokens = model_obj["max_input_tokens"]
-            logger.info(
+            logger.debug(
                f"Max tokens for {model_name}: {max_tokens} (from max_input_tokens)"
            )
            return max_tokens

        if "max_tokens" in model_obj:
            max_tokens = model_obj["max_tokens"]
-            logger.info(f"Max tokens for {model_name}: {max_tokens} (from max_tokens)")
+            logger.debug(f"Max tokens for {model_name}: {max_tokens} (from max_tokens)")
            return max_tokens

        logger.error(f"No max tokens found for LLM: {model_name}")
--- a/backend/onyx/onyxbot/slack/listener.py
+++ b/backend/onyx/onyxbot/slack/listener.py
@@ -30,7 +30,9 @@ from onyx.configs.onyxbot_configs import DANSWER_BOT_REPHRASE_MESSAGE
 from onyx.configs.onyxbot_configs import DANSWER_BOT_RESPOND_EVERY_CHANNEL
 from onyx.configs.onyxbot_configs import NOTIFY_SLACKBOT_NO_ANSWER
 from onyx.connectors.slack.utils import expert_info_from_slack_id
-from onyx.context.search.retrieval.search_runner import download_nltk_data
+from onyx.context.search.retrieval.search_runner import (
+    download_nltk_data,
+)
 from onyx.db.engine import get_all_tenant_ids
 from onyx.db.engine import get_session_with_tenant
 from onyx.db.models import SlackBot
--- a/backend/onyx/prompts/agent_search.py
+++ b/backend/onyx/prompts/agent_search.py
@@ -0,0 +1,542 @@
+# Standards
+SEPARATOR_LINE = "-------"
+SEPARATOR_LINE_LONG = "---------------"
+UNKNOWN_ANSWER = "I do not have enough information to answer this question."
+NO_RECOVERED_DOCS = "No relevant information recovered"
+YES = "yes"
+NO = "no"
+
+
+# Framing/Support/Template Prompts
+HISTORY_FRAMING_PROMPT = f"""
+For more context, here is the history of the conversation so far that preceded this question:
+{SEPARATOR_LINE}
+{{history}}
+{SEPARATOR_LINE}
+""".strip()
+
+
+ASSISTANT_SYSTEM_PROMPT_DEFAULT = "You are an assistant for question-answering tasks."
+
+ASSISTANT_SYSTEM_PROMPT_PERSONA = f"""
+You are an assistant for question-answering tasks. Here is more information about you:
+{SEPARATOR_LINE}
+{{persona_prompt}}
+{SEPARATOR_LINE}
+""".strip()
+
+
+SUB_QUESTION_ANSWER_TEMPLATE = f"""
+Sub-Question: Q{{sub_question_num}}
+Question:
+{{sub_question}}
+{SEPARATOR_LINE}
+Answer:
+{{sub_answer}}
+""".strip()
+
+
+SUB_QUESTION_ANSWER_TEMPLATE_REFINED = f"""
+Sub-Question: Q{{sub_question_num}}
+Type: {{sub_question_type}}
+Sub-Question:
+{SEPARATOR_LINE}
+{{sub_question}}
+{SEPARATOR_LINE}
+Answer:
+{SEPARATOR_LINE}
+{{sub_answer}}
+{SEPARATOR_LINE}
+""".strip()
+
+
+# Step/Utility Prompts
+# Note this one should always be used with the ENTITY_TERM_EXTRACTION_PROMPT_JSON_EXAMPLE
+ENTITY_TERM_EXTRACTION_PROMPT = f"""
+Based on the original question and some context retrieved from a dataset, please generate a list of
+entities (e.g. companies, organizations, industries, products, locations, etc.), terms and concepts
+(e.g. sales, revenue, etc.) that are relevant for the question, plus their relations to each other.
+
+Here is the original question:
+{SEPARATOR_LINE}
+{{question}}
+{SEPARATOR_LINE}
+
+And here is the context retrieved:
+{SEPARATOR_LINE}
+{{context}}
+{SEPARATOR_LINE}
+
+Please format your answer as a json object in the following format:
+""".lstrip()
+
+ENTITY_TERM_EXTRACTION_PROMPT_JSON_EXAMPLE = """
+{
+    "retrieved_entities_relationships": {
+        "entities": [
+            {
+                "entity_name": "<assign a name for the entity>",
+                "entity_type": "<specify a short type name for the entity, such as 'company', 'location',...>"
+            }
+        ],
+        "relationships": [
+            {
+                "relationship_name": "<assign a name for the relationship>",
+                "relationship_type": "<specify a short type name for the relationship, such as 'sales_to', 'is_location_of',...>",
+                "relationship_entities": ["<related entity name 1>", "<related entity name 2>", "..."]
+            }
+        ],
+        "terms": [
+            {
+                "term_name": "<assign a name for the term>",
+                "term_type": "<specify a short type name for the term, such as 'revenue', 'market_share',...>",
+                "term_similar_to": ["<list terms that are similar to this term>"]
+            }
+        ]
+    }
+}
+""".strip()
+
+
+HISTORY_CONTEXT_SUMMARY_PROMPT = (
+    "{persona_specification}\n\n"
+    "Your task now is to summarize the key parts of the history of a conversation between a user and an agent."
+    " The summary has two purposes:\n"
+    "  1) providing the suitable context for a new question, and\n"
+    "  2) To capture the key information that was discussed and that the user may have a follow-up question about.\n\n"
+    "Here is the question:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "And here is the history:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{history}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Please provide a summarized context from the history so that the question makes sense and can"
+    " - with suitable extra information - be answered.\n\n"
+    "Do not use more than three or four sentences.\n\n"
+    "History summary:"
+).strip()
+
+
+# INITIAL PHASE
+# Sub-question
+# Intentionally left a copy in case we want to modify this one differently
+INITIAL_QUESTION_DECOMPOSITION_PROMPT = (
+    "Decompose the initial user question into no more than 3 appropriate sub-questions that help to answer the"
+    " original question. The purpose for this decomposition may be to:\n"
+    "  1) isolate individual entities (i.e., 'compare sales of company A and company B' ->"
+    " ['what are sales for company A', 'what are sales for company B'])\n"
+    "  2) clarify or disambiguate ambiguous terms (i.e., 'what is our success with company A' ->"
+    " ['what are our sales with company A','what is our market share with company A',"
+    " 'is company A a reference customer for us', etc.])\n"
+    "  3) if a term or a metric is essentially clear, but it could relate to various components of an entity and you"
+    " are generally familiar with the entity, then you can decompose the question into sub-questions that are more"
+    " specific to components (i.e., 'what do we do to improve product X' -> ['what do we do to improve"
+    " scalability of product X', 'what do we do to improve stability of product X', ...])\n"
+    "  4) research an area that could really help to answer the question.\n\n"
+    "Here is the initial question to decompose:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "{history}\n\n"
+    "Do NOT include any text in your answer outside of the list of sub-questions! "
+    "Please formulate your answer as a newline-separated list of questions like so:\n"
+    " <sub-question>\n"
+    " <sub-question>\n"
+    " <sub-question>\n"
+    " ...\n\n"
+    "Answer:"
+).strip()
+
+
+# TODO: combine shared pieces with INITIAL_QUESTION_DECOMPOSITION_PROMPT
+INITIAL_DECOMPOSITION_PROMPT_QUESTIONS_AFTER_SEARCH = (
+    "Decompose the initial user question into no more than 3 appropriate sub-questions that help to answer the"
+    " original question. The purpose for this decomposition may be to:\n"
+    "  1) isolate individual entities (i.e., 'compare sales of company A and company B' ->"
+    " ['what are sales for company A', 'what are sales for company B'])\n"
+    "  2) clarify or disambiguate ambiguous terms (i.e., 'what is our success with company A' ->"
+    " ['what are our sales with company A','what is our market share with company A',"
+    " 'is company A a reference customer for us', etc.])\n"
+    "  3) if a term or a metric is essentially clear, but it could relate to various components of an entity and you"
+    " are generally familiar with the entity, then you can decompose the question into sub-questions that are more"
+    " specific to components (i.e., 'what do we do to improve product X' -> ['what do we do to improve"
+    " scalability of product X', 'what do we do to improve stability of product X', ...])\n"
+    "  4) research an area that could really help to answer the question.\n\n"
+    "To give you some context, you will see below also some documents that may relate to the question. Please only"
+    " use this information to learn what the question is approximately asking about, but do not focus on the details"
+    " to construct the sub-questions! Also, some of the entities, relationships and terms that are in the dataset may"
+    " not be in these few documents, so DO NOT focus too much on the documents when constructing the sub-questions!"
+    " Decomposition and disambiguations are most important!\n\n"
+    "Here are the sample docs to give you some context:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{sample_doc_str}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "And here is the initial question to decompose:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "{history}\n\n"
+    "Do NOT include any text in your answer outside of the list of sub-questions! "
+    "Please formulate your answer as a newline-separated list of questions like so:\n"
+    " <sub-question>\n"
+    " <sub-question>\n"
+    " <sub-question>\n"
+    " ...\n\n"
+    "Answer:"
+).strip()
+
+
+# Retrieval
+QUERY_REWRITING_PROMPT = (
+    "Please convert the initial user question into 2-3 more appropriate short and pointed search queries for"
+    " retrieval from a document store. Particularly, try to think about resolving ambiguities and make the search"
+    " queries more specific, enabling the system to search more broadly.\n"
+    "Also, try to make the search queries not redundant, i.e. not too similar!\n\n"
+    "Here is the initial question:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Do NOT include any text in your answer outside of the list of queries! "
+    "Formulate the queries separated by newlines (Do not say 'Query 1: ...', just write the querytext) as follows:\n"
+    "<query 1>\n"
+    "<query 2>\n"
+    "...\n\n"
+    "Queries:"
+)
+
+
+DOCUMENT_VERIFICATION_PROMPT = (
+    "Determine whether the following document text contains data or information that is potentially relevant "
+    "for a question. It does not have to be fully relevant, but check whether it has some information that "
+    "would help - possibly in conjunction with other documents - to address the question.\n\n"
+    "Be careful that you do not use a document where you are not sure whether the text applies to the objects "
+    "or entities that are relevant for the question. For example, a book about chess could have a long passage "
+    "discussing the psychology of chess without - within the passage - mentioning chess. If now a question "
+    "is asked about the psychology of football, one could be tempted to use the document as it does discuss "
+    "psychology in sports. However, it is NOT about football and should not be deemed relevant. Please "
+    "consider this logic.\n\n"
+    "DOCUMENT TEXT:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{document_content}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Do you think that this document text is useful and relevant to answer the following question?\n\n"
+    "QUESTION:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Please answer with exactly and only a 'yes' or 'no'. Do NOT include any other text in your response:\n\n"
+    "Answer:"
+).strip()
+
+
+# Sub-Question Answer Generation
+SUB_QUESTION_RAG_PROMPT = (
+    "Use the context provided below - and only the provided context - to answer the given question. "
+    "(Note that the answer is in service of answering a broader question, given below as 'motivation'.)\n\n"
+    "Again, only use the provided context and do not use your internal knowledge! If you cannot answer the "
+    f'question based on the context, say "{UNKNOWN_ANSWER}". It is a matter of life and death that you do NOT '
+    "use your internal knowledge, just the provided information!\n\n"
+    "Make sure that you keep all relevant information, specifically as it concerns the ultimate goal. "
+    "(But keep other details as well.)\n\n"
+    "It is critical that you provide inline citations in the format [D1], [D2], [D3], etc! "
+    "It is important that the citation is close to the information it supports. "
+    "Proper citations are very important to the user!\n\n"
+    "For your general information, here is the ultimate motivation:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{original_question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "And here is the actual question I want you to answer based on the context above (with the motivation in mind):\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Here is the context:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{context}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Please keep your answer brief and concise, and focus on facts and data.\n\n"
+    "Answer:"
+).strip()
+
+
+SUB_ANSWER_CHECK_PROMPT = (
+    "Determine whether the given answer addresses the given question. "
+    "Please do not use any internal knowledge you may have - just focus on whether the answer "
+    "as given seems to largely address the question as given, or at least addresses part of the question.\n\n"
+    "Here is the question:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Here is the suggested answer:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{base_answer}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    f'Does the suggested answer address the question? Please answer with "{YES}" or "{NO}".'
+).strip()
+
+
+# Initial Answer Generation
+INITIAL_ANSWER_PROMPT_W_SUB_QUESTIONS = (
+    "{persona_specification}\n\n"
+    "Use the information provided below - and only the provided information - to answer the provided main question.\n\n"
+    "The information provided below consists of:\n"
+    "  1) a number of answered sub-questions - these are very important to help you organize your thoughts and your answer\n"
+    "  2) a number of documents that were deemed relevant for the question.\n\n"
+    "{history}\n\n"
+    "It is critical that you provide proper inline citations to documents in the format [D1], [D2], [D3], etc.!\n"
+    "It is important that the citation is close to the information it supports. If you have multiple citations that support\n"
+    "a fact, please cite for example as [D1][D3], or [D2][D4], etc.\n"
+    "Feel free to also cite sub-questions in addition to documents, but make sure that you have documents cited with the "
+    "sub-question citation. If you want to cite both a document and a sub-question, please use [D1][Q3], or "
+    "[D2][D7][Q4], etc.\n"
+    "Again, please NEVER cite sub-questions without a document citation! "
+    "Proper citations are very important for the user!\n\n"
+    "IMPORTANT RULES:\n"
+    " - If you cannot reliably answer the question solely using the provided information, say that you cannot reliably answer.\n"
+    " You may give some additional facts you learned, but do not try to invent an answer.\n"
+    f' - If the information is empty or irrelevant, just say "{UNKNOWN_ANSWER}".\n'
+    " - If the information is relevant but not fully conclusive, specify that the information is not conclusive and say why.\n\n"
+    "Again, you should be sure that the answer is supported by the information provided!\n\n"
+    "Try to keep your answer concise. But also highlight uncertainties you may have should there be substantial ones,\n"
+    "or assumptions you made.\n\n"
+    "Here is the contextual information:\n"
+    f"{SEPARATOR_LINE_LONG}\n\n"
+    "*Answered Sub-questions (these should really matter!):\n"
+    f"{SEPARATOR_LINE}\n"
+    "{answered_sub_questions}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "And here are relevant document information that support the sub-question answers, "
+    "or that are relevant for the actual question:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{relevant_docs}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "And here is the question I want you to answer based on the information above:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Please keep your answer brief and concise, and focus on facts and data.\n\n"
+    "Answer:"
+).strip()
+
+
+# Used if sub_question_answer_str is empty
+INITIAL_ANSWER_PROMPT_WO_SUB_QUESTIONS = (
+    "{answered_sub_questions}{persona_specification}\n\n"
+    "Use the information provided below - and only the provided information - to answer the provided question. "
+    "The information provided below consists of a number of documents that were deemed relevant for the question.\n"
+    "{history}\n\n"
+    "IMPORTANT RULES:\n"
+    " - If you cannot reliably answer the question solely using the provided information, say that you cannot reliably answer. "
+    "You may give some additional facts you learned, but do not try to invent an answer.\n"
+    f' - If the information is irrelevant, just say "{UNKNOWN_ANSWER}".\n'
+    " - If the information is relevant but not fully conclusive, specify that the information is not conclusive and say why.\n\n"
+    "Again, you should be sure that the answer is supported by the information provided!\n\n"
+    "It is critical that you provide proper inline citations to documents in the format [D1], [D2], [D3], etc! "
+    "It is important that the citation is close to the information it supports. If you have multiple citations, "
+    "please cite for example as [D1][D3], or [D2][D4], etc. Citations are very important for the user!\n\n"
+    "Here is the relevant context information:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{relevant_docs}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "And here is the question I want you to answer based on the context above:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Please keep your answer brief and concise, and focus on facts and data.\n\n"
+    "Answer:"
+).strip()
+
+
+# REFINEMENT PHASE
+REFINEMENT_QUESTION_DECOMPOSITION_PROMPT = (
+    "An initial user question needs to be answered. An initial answer has been provided but it wasn't quite "
+    "good enough. Also, some sub-questions had been answered and this information has been used to provide "
+    "the initial answer. Some other subquestions may have been suggested based on little knowledge, but they "
+    "were not directly answerable. Also, some entities, relationships and terms are given to you so that "
+    "you have an idea of what the available data looks like.\n\n"
+    "Your role is to generate 2-4 new sub-questions that would help to answer the initial question, considering:\n\n"
+    "1) The initial question\n"
+    "2) The initial answer that was found to be unsatisfactory\n"
+    "3) The sub-questions that were answered\n"
+    "4) The sub-questions that were suggested but not answered\n"
+    "5) The entities, relationships and terms that were extracted from the context\n\n"
+    "The individual questions should be answerable by a good RAG system. "
+    "So a good idea would be to use the sub-questions to resolve ambiguities and/or to separate the "
+    "question for different entities that may be involved in the original question, but in a way that does "
+    "not duplicate questions that were already tried.\n\n"
+    "Additional Guidelines:\n"
+    "- The sub-questions should be specific to the question and provide richer context for the question, "
+    "resolve ambiguities, or address shortcomings of the initial answer\n"
+    "- Each sub-question - when answered - should be relevant for the answer to the original question\n"
+    "- The sub-questions should be free from comparisons, ambiguities, judgements, aggregations, or any "
+    "other complications that may require extra context.\n"
+    "- The sub-questions MUST have the full context of the original question so that it can be executed by "
+    "a RAG system independently without the original question available\n"
+    "    (Example:\n"
+    '    - initial question: "What is the capital of France?"\n'
+    '    - bad sub-question: "What is the name of the river there?"\n'
+    '    - good sub-question: "What is the name of the river that flows through Paris?")\n'
+    "- For each sub-question, please also provide a search term that can be used to retrieve relevant "
+    "documents from a document store.\n"
+    "- Consider specifically the sub-questions that were suggested but not answered. This is a sign that they are not "
+    "answerable with the available context, and you should not ask similar questions.\n\n"
+    "Here is the initial question:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n"
+    "{history}\n\n"
+    "Here is the initial sub-optimal answer:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{base_answer}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Here are the sub-questions that were answered:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{answered_sub_questions}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Here are the sub-questions that were suggested but not answered:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{failed_sub_questions}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "And here are the entities, relationships and terms extracted from the context:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{entity_term_extraction_str}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Please generate the list of good, fully contextualized sub-questions that would help to address the main question.\n"
+    "Specifically pay attention also to the entities, relationships and terms extracted, as these indicate what type of "
+    "objects/relationships/terms you can ask about! Do not ask about entities, terms or relationships that are not "
+    "mentioned in the 'entities, relationships and terms' section.\n\n"
+    "Again, please find questions that are NOT overlapping too much with the already answered "
+    "sub-questions or those that already were suggested and failed.\n"
+    "In other words - what can we try in addition to what has been tried so far?\n\n"
+    "Generate the list of questions separated by one new line like this:\n"
+    "<sub-question 1>\n"
+    "<sub-question 2>\n"
+    "<sub-question 3>\n"
+    "..."
+).strip()
+
+
+REFINED_ANSWER_PROMPT_W_SUB_QUESTIONS = (
+    "{persona_specification}\n\n"
+    "Your task is to improve on a given answer to a question, as the initial answer was found to be lacking in some way.\n\n"
+    "Use the information provided below - and only the provided information - to write your new and improved answer.\n\n"
+    "The information provided below consists of:\n"
+    "  1) an initial answer that was given but found to be lacking in some way.\n"
+    "  2) a number of answered sub-questions - these are very important(!) and definitely should help you to answer "
+    "the main question. Note that the sub-questions have a type, 'initial' and 'refined'. The 'initial' "
+    "ones were available for the creation of the initial answer, but the 'refined' were not, they are new. So please use "
+    "the 'refined' sub-questions in particular to update/extend/correct/enrich the initial answer and to add "
+    "more details/new facts!\n"
+    "  3) a number of documents that were deemed relevant for the question. This is the context that you use largely for "
+    "citations (see below). So consider the answers to the sub-questions as guidelines to construct your new answer, but "
+    "make sure you cite the relevant document for a fact!\n\n"
+    "It is critical that you provide proper inline citations to documents in the format [D1], [D2], [D3], etc! "
+    "It is important that the citation is close to the information it supports. "
+    "DO NOT just list all of the citations at the very end. "
+    "Feel free to also cite sub-questions in addition to documents, but make sure that you have documents cited with the "
+    "sub-question citation. If you want to cite both a document and a sub-question, please use [D1][Q3], or [D2][D7][Q4], etc. "
+    "and always place the document citation before the sub-question citation. "
+    "Again, please NEVER cite sub-questions without a document citation!\n"
+    "Proper citations are very important for the user!\n\n"
+    "{history}\n\n"
+    "IMPORTANT RULES:\n"
+    " - If you cannot reliably answer the question solely using the provided information, say that you cannot reliably answer. "
+    "You may give some additional facts you learned, but do not try to invent an answer.\n"
+    f' - If the information is empty or irrelevant, just say "{UNKNOWN_ANSWER}".\n'
+    " - If the information is relevant but not fully conclusive, provide an answer to the extent you can but also "
+    "specify that the information is not conclusive and why.\n"
+    " - Ignore any existing citations within the answered sub-questions, like [D1]... and [Q2]! "
+    "The citations you will need to use will need to refer to the documents (and sub-questions) that you are explicitly "
+    "presented with below!\n\n"
+    "Again, you should be sure that the answer is supported by the information provided!\n\n"
+    "Try to keep your answer concise. But also highlight uncertainties you may have should there be substantial ones, "
+    "or assumptions you made.\n\n"
+    "Here is the contextual information:\n"
+    f"{SEPARATOR_LINE_LONG}\n\n"
+    "*Initial Answer that was found to be lacking:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{initial_answer}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "*Answered Sub-questions (these should really help you to research your answer! They also contain questions/answers "
+    "that were not available when the original answer was constructed):\n"
+    "{answered_sub_questions}\n\n"
+    "And here are the relevant documents that support the sub-question answers, and that are relevant for the actual question:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{relevant_docs}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Lastly, here is the main question I want you to answer based on the information above:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Please keep your answer brief and concise, and focus on facts and data.\n\n"
+    "Answer:"
+).strip()
+
+# Used if sub_question_answer_str is empty
+REFINED_ANSWER_PROMPT_WO_SUB_QUESTIONS = (
+    "{answered_sub_questions}{persona_specification}\n\n"
+    "Use the information provided below - and only the provided information - to answer the provided question.\n\n"
+    "The information provided below consists of:\n"
+    "  1) an initial answer that was given but found to be lacking in some way.\n"
+    "  2) a number of documents that were also deemed relevant for the question.\n\n"
+    "It is critical that you provide proper inline citations to documents in the format [D1], [D2], [D3], etc! "
+    "It is important that the citation is close to the information it supports. "
+    "DO NOT just list all of the citations at the very end of your response. Citations are very important for the user!\n\n"
+    "{history}\n\n"
+    "IMPORTANT RULES:\n"
+    " - If you cannot reliably answer the question solely using the provided information, say that you cannot reliably answer. "
+    "You may give some additional facts you learned, but do not try to invent an answer.\n"
+    f' - If the information is empty or irrelevant, just say "{UNKNOWN_ANSWER}".\n'
+    " - If the information is relevant but not fully conclusive, provide an answer to the extent you can but also "
+    "specify that the information is not conclusive and why.\n\n"
+    "Again, you should be sure that the answer is supported by the information provided!\n\n"
+    "Try to keep your answer concise. But also highlight uncertainties you may have should there be substantial ones, "
+    "or assumptions you made.\n\n"
+    "Here is the contextual information:\n"
+    f"{SEPARATOR_LINE_LONG}\n\n"
+    "*Initial Answer that was found to be lacking:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{initial_answer}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "And here are relevant document information that support the sub-question answers, "
+    "or that are relevant for the actual question:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{relevant_docs}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Lastly, here is the question I want you to answer based on the information above:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Please keep your answer brief and concise, and focus on facts and data.\n\n"
+    "Answer:"
+).strip()
+
+
+INITIAL_REFINED_ANSWER_COMPARISON_PROMPT = (
+    "For the given question, please compare the initial answer and the refined answer and determine if "
+    "the refined answer is substantially better than the initial answer, not just a bit better. Better could mean:\n"
+    " - additional information\n"
+    " - more comprehensive information\n"
+    " - more concise information\n"
+    " - more structured information\n"
+    " - more details\n"
+    " - new bullet points\n"
+    " - substantially more document citations ([D1], [D2], [D3], etc.)\n\n"
+    "Put yourself in the shoes of the user and think about whether the refined answer is really substantially "
+    "better and delivers really new insights than the initial answer.\n\n"
+    "Here is the question:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{question}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Here is the initial answer:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{initial_answer}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "Here is the refined answer:\n"
+    f"{SEPARATOR_LINE}\n"
+    "{refined_answer}\n"
+    f"{SEPARATOR_LINE}\n\n"
+    "With these criteria in mind, is the refined answer substantially better than the initial answer?\n\n"
+    f'Please answer with a simple "{YES}" or "{NO}".'
+).strip()
--- a/backend/onyx/secondary_llm_flows/choose_search.py
+++ b/backend/onyx/secondary_llm_flows/choose_search.py
@@ -81,7 +81,4 @@ def check_if_need_search(

    logger.debug(f"Run search prediction: {require_search_output}")

-    if (SKIP_SEARCH.split()[0]).lower() in require_search_output.lower():
-        return False
-
-    return True
+    return (SKIP_SEARCH.split()[0]).lower() not in require_search_output.lower()
--- a/backend/onyx/server/features/persona/models.py
+++ b/backend/onyx/server/features/persona/models.py
@@ -112,7 +112,7 @@ class PersonaSnapshot(BaseModel):
    uploaded_image_id: str | None = None
    is_default_persona: bool
    search_start_date: datetime | None = None
-    labels: list["PersonaLabelSnapshot"]
+    labels: list["PersonaLabelSnapshot"] = []

    @classmethod
    def from_model(
--- a/backend/onyx/server/query_and_chat/chat_backend.py
+++ b/backend/onyx/server/query_and_chat/chat_backend.py
@@ -213,6 +213,8 @@ def get_chat_session(
        # we need the tool call objs anyways, so just fetch them in a single call
        prefetch_tool_calls=True,
    )
+    for message in session_messages:
+        translate_db_message_to_chat_message_detail(message)

    return ChatSessionDetailResponse(
        chat_session_id=session_id,
@@ -352,10 +354,12 @@ async def is_connected(request: Request) -> Callable[[], bool]:
    def is_connected_sync() -> bool:
        future = asyncio.run_coroutine_threadsafe(request.is_disconnected(), main_loop)
        try:
-            is_connected = not future.result(timeout=0.01)
+            is_connected = not future.result(timeout=0.05)
            return is_connected
        except asyncio.TimeoutError:
-            logger.error("Asyncio timed out")
+            logger.warning(
+                "Asyncio timed out (potentially missed request to stop streaming)"
+            )
            return True
        except Exception as e:
            error_msg = str(e)
@@ -416,6 +420,10 @@ def handle_new_chat_message(

    def stream_generator() -> Generator[str, None, None]:
        try:
+            import time
+
+            start_time = time.time()
+            n = 0
            for packet in stream_chat_message(
                new_msg_req=chat_message_req,
                user=user,
@@ -427,6 +435,19 @@ def handle_new_chat_message(
                ),
                is_connected=is_connected_func,
            ):
+                if "top_documents" in packet:
+                    to_first_docs = time.time() - start_time
+                    print(f"Time to first docs: {to_first_docs}")
+                    print(packet)
+                elif "answer_piece" in packet:
+                    to_answer_piece = time.time() - start_time
+                    if n == 1:
+                        print(f"Time to answer piece: {to_answer_piece}")
+                        print(packet)
+                    n += 1
+
+                # time_since_start = time.time() - start_time
+                # print(f"Time since start: {time_since_start}")
                yield json.dumps(packet) if isinstance(packet, dict) else packet

        except Exception as e:
--- a/backend/onyx/server/query_and_chat/models.py
+++ b/backend/onyx/server/query_and_chat/models.py
@@ -134,6 +134,12 @@ class CreateChatMessageRequest(ChunkContext):
    # https://platform.openai.com/docs/guides/structured-outputs/introduction
    structured_response_format: dict | None = None

+    # If true, ignores most of the search options above and uses pro (agentic) search instead.
+    # TODO: decide how many of the above options we want to pass through to pro search
+    use_agentic_search: bool = False
+
+    skip_gen_ai_answer_generation: bool = False
+
    @model_validator(mode="after")
    def check_search_doc_ids_or_retrieval_options(self) -> "CreateChatMessageRequest":
        if self.search_doc_ids is None and self.retrieval_options is None:
@@ -200,6 +206,22 @@ class SearchFeedbackRequest(BaseModel):
        return self


+class SubQueryDetail(BaseModel):
+    query: str
+    query_id: int
+    # TODO: store these to enable per-query doc selection
+    doc_ids: list[int] | None = None
+
+
+class SubQuestionDetail(BaseModel):
+    level: int
+    level_question_num: int
+    question: str
+    answer: str
+    sub_queries: list[SubQueryDetail] | None = None
+    context_docs: RetrievalDocs | None = None
+
+
 class ChatMessageDetail(BaseModel):
    message_id: int
    parent_message: int | None = None
@@ -211,11 +233,13 @@ class ChatMessageDetail(BaseModel):
    time_sent: datetime
    overridden_model: str | None
    alternate_assistant_id: int | None = None
-    # Dict mapping citation number to db_doc_id
    chat_session_id: UUID | None = None
+    # Dict mapping citation number to db_doc_id
    citations: dict[int, int] | None = None
+    sub_questions: list[SubQuestionDetail] | None = None
    files: list[FileDescriptor]
    tool_call: ToolCallFinalResult | None
+    refined_answer_improvement: bool | None = None

    def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]:  # type: ignore
        initial_dict = super().model_dump(mode="json", *args, **kwargs)  # type: ignore
--- a/backend/onyx/server/settings/api.py
+++ b/backend/onyx/server/settings/api.py
@@ -10,7 +10,6 @@ from onyx.auth.users import current_user
 from onyx.auth.users import is_user_admin
 from onyx.configs.constants import KV_REINDEX_KEY
 from onyx.configs.constants import NotificationType
-from onyx.db.engine import get_current_tenant_id
 from onyx.db.engine import get_session
 from onyx.db.models import User
 from onyx.db.notification import create_notification
@@ -43,7 +42,6 @@ def put_settings(
 def fetch_settings(
    user: User | None = Depends(current_user),
    db_session: Session = Depends(get_session),
-    tenant_id: str | None = Depends(get_current_tenant_id),
 ) -> UserSettings:
    """Settings and notifications are stuffed into this single endpoint to reduce number of
    Postgres calls"""
--- a/backend/onyx/server/settings/models.py
+++ b/backend/onyx/server/settings/models.py
@@ -45,6 +45,8 @@ class Settings(BaseModel):
    gpu_enabled: bool | None = None
    product_gating: GatingType = GatingType.NONE
    anonymous_user_enabled: bool | None = None
+    pro_search_disabled: bool | None = None
+    auto_scroll: bool | None = None


 class UserSettings(Settings):
--- a/backend/onyx/server/settings/store.py
+++ b/backend/onyx/server/settings/store.py
@@ -11,6 +11,16 @@ logger = setup_logger()


 def load_settings() -> Settings:
+    kv_store = get_kv_store()
+    try:
+        stored_settings = kv_store.load(KV_SETTINGS_KEY)
+        settings = (
+            Settings.model_validate(stored_settings) if stored_settings else Settings()
+        )
+    except Exception as e:
+        logger.error(f"Error loading settings from KV store: {str(e)}")
+        settings = Settings()
+
    tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() if MULTI_TENANT else None
    redis_client = get_redis_client(tenant_id=tenant_id)

@@ -26,10 +36,10 @@ def load_settings() -> Settings:
            redis_client.set(OnyxRedisLocks.ANONYMOUS_USER_ENABLED, "0")
    except Exception as e:
        # Log the error and reset to default
-        logger.error(f"Error loading settings from Redis: {str(e)}")
+        logger.error(f"Error loading anonymous user setting from Redis: {str(e)}")
        anonymous_user_enabled = False

-    settings = Settings(anonymous_user_enabled=anonymous_user_enabled)
+    settings.anonymous_user_enabled = anonymous_user_enabled
    return settings


--- a/backend/onyx/setup.py
+++ b/backend/onyx/setup.py
@@ -11,7 +11,9 @@ from onyx.configs.model_configs import FAST_GEN_AI_MODEL_VERSION
 from onyx.configs.model_configs import GEN_AI_API_KEY
 from onyx.configs.model_configs import GEN_AI_MODEL_VERSION
 from onyx.context.search.models import SavedSearchSettings
-from onyx.context.search.retrieval.search_runner import download_nltk_data
+from onyx.context.search.retrieval.search_runner import (
+    download_nltk_data,
+)
 from onyx.db.connector import check_connectors_exist
 from onyx.db.connector import create_initial_default_connector
 from onyx.db.connector_credential_pair import associate_default_cc_pair
--- a/backend/onyx/tools/message.py
+++ b/backend/onyx/tools/message.py
@@ -4,7 +4,7 @@ from typing import Any
 from langchain_core.messages.ai import AIMessage
 from langchain_core.messages.tool import ToolCall
 from langchain_core.messages.tool import ToolMessage
-from pydantic.v1 import BaseModel as BaseModel__v1
+from pydantic import BaseModel

 from onyx.natural_language_processing.utils import BaseTokenizer

@@ -21,10 +21,16 @@ def build_tool_message(
    )


-class ToolCallSummary(BaseModel__v1):
+# TODO: does this NEED to be BaseModel__v1?
+class ToolCallSummary(BaseModel):
    tool_call_request: AIMessage
    tool_call_result: ToolMessage

+    # This is a workaround to allow arbitrary types in the model
+    # TODO: Remove this once we have a better solution
+    class Config:
+        arbitrary_types_allowed = True
+

 def tool_call_tokens(
    tool_call_summary: ToolCallSummary, llm_tokenizer: BaseTokenizer
--- a/backend/onyx/tools/models.py
+++ b/backend/onyx/tools/models.py
@@ -4,6 +4,9 @@ from uuid import UUID
 from pydantic import BaseModel
 from pydantic import model_validator

+from onyx.context.search.enums import SearchType
+from onyx.context.search.models import IndexFilters
+

 class ToolResponse(BaseModel):
    id: str | None = None
@@ -38,6 +41,9 @@ class ToolCallFinalResult(ToolCallKickoff):
    tool_result: Any = (
        None  # we would like to use JSON_ro, but can't due to its recursive nature
    )
+    # Agentic additions; these only need to be set during agentic tool calls
+    level: int | None = None
+    level_question_num: int | None = None


 class DynamicSchemaInfo(BaseModel):
@@ -45,5 +51,11 @@ class DynamicSchemaInfo(BaseModel):
    message_id: int | None


+class SearchQueryInfo(BaseModel):
+    predicted_search: SearchType | None
+    final_filters: IndexFilters
+    recency_bias_multiplier: float
+
+
 CHAT_SESSION_ID_PLACEHOLDER = "CHAT_SESSION_ID"
 MESSAGE_ID_PLACEHOLDER = "MESSAGE_ID"
--- a/backend/onyx/tools/tool_implementations/search/search_tool.py
+++ b/backend/onyx/tools/tool_implementations/search/search_tool.py
@@ -1,9 +1,9 @@
 import json
+from collections.abc import Callable
 from collections.abc import Generator
 from typing import Any
 from typing import cast

-from pydantic import BaseModel
 from sqlalchemy.orm import Session

 from onyx.chat.chat_utils import llm_doc_from_inference_section
@@ -25,13 +25,13 @@ from onyx.configs.chat_configs import CONTEXT_CHUNKS_BELOW
 from onyx.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS
 from onyx.context.search.enums import LLMEvaluationType
 from onyx.context.search.enums import QueryFlow
-from onyx.context.search.enums import SearchType
 from onyx.context.search.models import IndexFilters
 from onyx.context.search.models import InferenceSection
 from onyx.context.search.models import RerankingDetails
 from onyx.context.search.models import RetrievalDetails
 from onyx.context.search.models import SearchRequest
 from onyx.context.search.pipeline import SearchPipeline
+from onyx.context.search.pipeline import section_relevance_list_impl
 from onyx.db.models import Persona
 from onyx.db.models import User
 from onyx.llm.interfaces import LLM
@@ -39,6 +39,7 @@ from onyx.llm.models import PreviousMessage
 from onyx.secondary_llm_flows.choose_search import check_if_need_search
 from onyx.secondary_llm_flows.query_expansion import history_based_query_rephrase
 from onyx.tools.message import ToolCallSummary
+from onyx.tools.models import SearchQueryInfo
 from onyx.tools.models import ToolResponse
 from onyx.tools.tool import Tool
 from onyx.tools.tool_implementations.search.search_utils import llm_doc_to_dict
@@ -48,9 +49,6 @@ from onyx.tools.tool_implementations.search_like_tool_utils import (
 from onyx.tools.tool_implementations.search_like_tool_utils import (
    FINAL_CONTEXT_DOCUMENTS_ID,
 )
-from onyx.tools.tool_implementations.search_like_tool_utils import (
-    ORIGINAL_CONTEXT_DOCUMENTS_ID,
-)
 from onyx.utils.logger import setup_logger
 from onyx.utils.special_types import JSON_ro

@@ -62,13 +60,10 @@ SECTION_RELEVANCE_LIST_ID = "section_relevance_list"
 SEARCH_EVALUATION_ID = "llm_doc_eval"


-class SearchResponseSummary(BaseModel):
+class SearchResponseSummary(SearchQueryInfo):
    top_sections: list[InferenceSection]
    rephrased_query: str | None = None
    predicted_flow: QueryFlow | None
-    predicted_search: SearchType | None
-    final_filters: IndexFilters
-    recency_bias_multiplier: float


 SEARCH_TOOL_DESCRIPTION = """
@@ -281,8 +276,14 @@ class SearchTool(Tool):

        yield ToolResponse(id=FINAL_CONTEXT_DOCUMENTS_ID, response=llm_docs)

-    def run(self, **kwargs: str) -> Generator[ToolResponse, None, None]:
+    def run(self, **kwargs: Any) -> Generator[ToolResponse, None, None]:
        query = cast(str, kwargs["query"])
+        force_no_rerank = cast(bool, kwargs.get("force_no_rerank", False))
+        alternate_db_session = cast(Session, kwargs.get("alternate_db_session", None))
+        retrieved_sections_callback = cast(
+            Callable[[list[InferenceSection]], None],
+            kwargs.get("retrieved_sections_callback"),
+        )

        if self.selected_sections:
            yield from self._build_response_for_specified_sections(query)
@@ -291,7 +292,9 @@ class SearchTool(Tool):
        search_pipeline = SearchPipeline(
            search_request=SearchRequest(
                query=query,
-                evaluation_type=self.evaluation_type,
+                evaluation_type=LLMEvaluationType.SKIP
+                if force_no_rerank
+                else self.evaluation_type,
                human_selected_filters=(
                    self.retrieval_options.filters if self.retrieval_options else None
                ),
@@ -300,7 +303,16 @@ class SearchTool(Tool):
                    self.retrieval_options.offset if self.retrieval_options else None
                ),
                limit=self.retrieval_options.limit if self.retrieval_options else None,
-                rerank_settings=self.rerank_settings,
+                rerank_settings=RerankingDetails(
+                    rerank_model_name=None,
+                    rerank_api_url=None,
+                    rerank_provider_type=None,
+                    rerank_api_key=None,
+                    num_rerank=0,
+                    disable_rerank_for_streaming=True,
+                )
+                if force_no_rerank
+                else self.rerank_settings,
                chunks_above=self.chunks_above,
                chunks_below=self.chunks_below,
                full_doc=self.full_doc,
@@ -314,57 +326,25 @@ class SearchTool(Tool):
            llm=self.llm,
            fast_llm=self.fast_llm,
            bypass_acl=self.bypass_acl,
-            db_session=self.db_session,
+            db_session=alternate_db_session or self.db_session,
            prompt_config=self.prompt_config,
+            retrieved_sections_callback=retrieved_sections_callback,
        )

-        yield ToolResponse(
-            id=SEARCH_RESPONSE_SUMMARY_ID,
-            response=SearchResponseSummary(
-                rephrased_query=query,
-                top_sections=search_pipeline.final_context_sections,
-                predicted_flow=search_pipeline.predicted_flow,
-                predicted_search=search_pipeline.predicted_search_type,
-                final_filters=search_pipeline.search_query.filters,
-                recency_bias_multiplier=search_pipeline.search_query.recency_bias_multiplier,
-            ),
+        search_query_info = SearchQueryInfo(
+            predicted_search=search_pipeline.search_query.search_type,
+            final_filters=search_pipeline.search_query.filters,
+            recency_bias_multiplier=search_pipeline.search_query.recency_bias_multiplier,
        )
-
-        yield ToolResponse(
-            id=SEARCH_DOC_CONTENT_ID,
-            response=OnyxContexts(
-                contexts=[
-                    OnyxContext(
-                        content=section.combined_content,
-                        document_id=section.center_chunk.document_id,
-                        semantic_identifier=section.center_chunk.semantic_identifier,
-                        blurb=section.center_chunk.blurb,
-                    )
-                    for section in search_pipeline.reranked_sections
-                ]
-            ),
+        yield from yield_search_responses(
+            query,
+            search_pipeline.reranked_sections,
+            search_pipeline.final_context_sections,
+            search_query_info,
+            lambda: search_pipeline.section_relevance,
+            self,
        )

-        yield ToolResponse(
-            id=SECTION_RELEVANCE_LIST_ID,
-            response=search_pipeline.section_relevance,
-        )
-
-        pruned_sections = prune_sections(
-            sections=search_pipeline.final_context_sections,
-            section_relevance_list=search_pipeline.section_relevance_list,
-            prompt_config=self.prompt_config,
-            llm_config=self.llm.config,
-            question=query,
-            contextual_pruning_config=self.contextual_pruning_config,
-        )
-
-        llm_docs = [
-            llm_doc_from_inference_section(section) for section in pruned_sections
-        ]
-
-        yield ToolResponse(id=FINAL_CONTEXT_DOCUMENTS_ID, response=llm_docs)
-
    def final_result(self, *args: ToolResponse) -> JSON_ro:
        final_docs = cast(
            list[LlmDoc],
@@ -414,7 +394,7 @@ class SearchTool(Tool):
                final_search_results = cast(list[LlmDoc], yield_item.response)
            elif (
                isinstance(yield_item, ToolResponse)
-                and yield_item.id == ORIGINAL_CONTEXT_DOCUMENTS_ID
+                and yield_item.id == SEARCH_DOC_CONTENT_ID
            ):
                search_contexts = yield_item.response.contexts
                # original_doc_search_rank = 1
@@ -425,3 +405,64 @@ class SearchTool(Tool):
                initial_search_results = cast(list[LlmDoc], initial_search_results)

        return final_search_results, initial_search_results
+
+
+# Allows yielding the same responses as a SearchTool without being a SearchTool.
+# SearchTool passed in to allow for access to SearchTool properties.
+# We can't just call SearchTool methods in the graph because we're operating on
+# the retrieved docs (reranking, deduping, etc.) after the SearchTool has run.
+def yield_search_responses(
+    query: str,
+    reranked_sections: list[InferenceSection],
+    final_context_sections: list[InferenceSection],
+    search_query_info: SearchQueryInfo,
+    get_section_relevance: Callable[[], list[SectionRelevancePiece] | None],
+    search_tool: SearchTool,
+) -> Generator[ToolResponse, None, None]:
+    yield ToolResponse(
+        id=SEARCH_RESPONSE_SUMMARY_ID,
+        response=SearchResponseSummary(
+            rephrased_query=query,
+            top_sections=final_context_sections,
+            predicted_flow=QueryFlow.QUESTION_ANSWER,
+            predicted_search=search_query_info.predicted_search,
+            final_filters=search_query_info.final_filters,
+            recency_bias_multiplier=search_query_info.recency_bias_multiplier,
+        ),
+    )
+
+    yield ToolResponse(
+        id=SEARCH_DOC_CONTENT_ID,
+        response=OnyxContexts(
+            contexts=[
+                OnyxContext(
+                    content=section.combined_content,
+                    document_id=section.center_chunk.document_id,
+                    semantic_identifier=section.center_chunk.semantic_identifier,
+                    blurb=section.center_chunk.blurb,
+                )
+                for section in reranked_sections
+            ]
+        ),
+    )
+
+    section_relevance = get_section_relevance()
+    yield ToolResponse(
+        id=SECTION_RELEVANCE_LIST_ID,
+        response=section_relevance,
+    )
+
+    pruned_sections = prune_sections(
+        sections=final_context_sections,
+        section_relevance_list=section_relevance_list_impl(
+            section_relevance, final_context_sections
+        ),
+        prompt_config=search_tool.prompt_config,
+        llm_config=search_tool.llm.config,
+        question=query,
+        contextual_pruning_config=search_tool.contextual_pruning_config,
+    )
+
+    llm_docs = [llm_doc_from_inference_section(section) for section in pruned_sections]
+
+    yield ToolResponse(id=FINAL_CONTEXT_DOCUMENTS_ID, response=llm_docs)
--- a/backend/onyx/tools/tool_implementations/search_like_tool_utils.py
+++ b/backend/onyx/tools/tool_implementations/search_like_tool_utils.py
@@ -15,7 +15,6 @@ from onyx.tools.message import ToolCallSummary
 from onyx.tools.models import ToolResponse


-ORIGINAL_CONTEXT_DOCUMENTS_ID = "search_doc_content"
 FINAL_CONTEXT_DOCUMENTS_ID = "final_context_documents"


--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
joachim-danswer	37ac04859c	StreamWriter w/o default values	2025-02-04 16:18:01 -08:00
Evan Lohn	f80d3bf2da	code for timing langgraph basic search	2025-02-04 13:01:06 -08:00
pablodanswer	b97819189b	push various minor updates	2025-02-03 21:23:45 -08:00
Evan Lohn	b928201397	fixed rebase issue and some cleanup	2025-02-03 20:49:45 -08:00
Yuhong Sun	b500c914b0	cleanup	2025-02-03 20:10:51 -08:00
Yuhong Sun	4b0d22fae3	prompts	2025-02-03 20:10:51 -08:00
joachim-danswer	b46c09ac6c	EL comments	2025-02-03 20:10:51 -08:00
joachim-danswer	3ce8923086	fix for citation update	2025-02-03 20:10:51 -08:00
joachim-danswer	7ac6d3ed50	logging level changes	2025-02-03 20:10:51 -08:00
joachim-danswer	3cd057d7a2	LangGraph comments	2025-02-03 20:10:51 -08:00
joachim-danswer	4834ee6223	new citation format	2025-02-03 20:10:51 -08:00
pablodanswer	cb85be41b1	add proper citation handling	2025-02-03 20:10:51 -08:00
joachim-danswer	eb227c0acc	nit update	2025-02-03 20:10:51 -08:00
joachim-danswer	25a57e2292	add title and meta-data to doc	2025-02-03 20:10:51 -08:00
pablodanswer	3f3b04a4ee	update width	2025-02-03 20:10:51 -08:00
Evan Lohn	3f6de7968a	prompt improvements for wekaer models	2025-02-03 20:10:51 -08:00
pablodanswer	024207e2d9	update	2025-02-03 20:10:51 -08:00
Yuhong Sun	8f7db9212c	k	2025-02-03 20:10:51 -08:00
pablodanswer	b1e9e03aa4	nit	2025-02-03 20:10:51 -08:00
pablodanswer	87a53d6d80	quick update	2025-02-03 20:10:51 -08:00
Yuhong Sun	59c65a4192	prompts	2025-02-03 20:10:51 -08:00
pablodanswer	c984c6c7f2	add pro search disable	2025-02-03 20:10:51 -08:00
Yuhong Sun	9a3ce504bc	beta	2025-02-03 20:10:51 -08:00
Yuhong Sun	16265d27f5	k	2025-02-03 20:10:51 -08:00
Yuhong Sun	570fe43efb	log level changes	2025-02-03 20:10:51 -08:00
Yuhong Sun	506a9f1b94	Yuhong	2025-02-03 20:10:51 -08:00
Yuhong Sun	a067b32467	Partial Prompt Updates (#3880 )	2025-02-03 20:10:51 -08:00
pablodanswer	9b6e51b4fe	k	2025-02-03 20:10:51 -08:00
joachim-danswer	e23dd0a3fa	renames + fix of refined answer generation prompt	2025-02-03 20:10:51 -08:00
Evan Lohn	71304e4228	always persist in agent search	2025-02-03 20:10:51 -08:00
Evan Lohn	2adeaaeded	loading object into model instead of json	2025-02-03 20:10:51 -08:00
Evan Lohn	a96728ff4d	prompt piece optimizations	2025-02-03 20:10:51 -08:00
pablodanswer	eaffdee0dc	broadly fixed minus some issues	2025-02-03 20:10:51 -08:00
pablodanswer	feaa3b653f	fix misc issues	2025-02-03 20:10:51 -08:00
joachim-danswer	9438f9df05	removal of sone unused states/models	2025-02-03 20:10:51 -08:00
joachim-danswer	b90e0834a5	major renaming	2025-02-03 20:10:51 -08:00
Evan Lohn	29440f5482	alembic heads, basic citations, search pipeline state	2025-02-03 20:10:51 -08:00
Evan Lohn	5a95a5c9fd	large number of PR comments addressed	2025-02-03 20:10:51 -08:00
Evan Lohn	118e8afbef	reworked config to have logical structure	2025-02-03 20:10:51 -08:00
joachim-danswer	8342168658	initial variable renaming	2025-02-03 20:10:51 -08:00
joachim-danswer	d5661baf98	history summary fix - adjusted prompt - adjusted citation removal - length cutoff by words, not characters	2025-02-03 20:10:51 -08:00
joachim-danswer	95fcc0019c	history summary update	2025-02-03 20:10:51 -08:00
joachim-danswer	0ccd83e809	deep_search_a and agent_a_config renaming	2025-02-03 20:10:51 -08:00
joachim-danswer	732861a940	rename of documents to verified_reranked_documents	2025-02-03 20:10:51 -08:00
joachim-danswer	d53dd1e356	cited_docs -> cited_documents	2025-02-03 20:10:51 -08:00
joachim-danswer	1a2760edee	improved logging through agent_state plus some default fixes	2025-02-03 20:10:51 -08:00
joachim-danswer	23ae4547ca	default values of number of strings and other things	2025-02-03 20:10:51 -08:00
Evan Lohn	385b344a43	addressed TODOs	2025-02-03 20:10:51 -08:00
Evan Lohn	a340529de3	sync streaming impl	2025-02-03 20:10:51 -08:00
joachim-danswer	4a0b2a6c09	additional naming fixes	2025-02-03 20:10:51 -08:00
joachim-danswer	756a1cbf8f	answer_refined_question_subgraphs	2025-02-03 20:10:51 -08:00
joachim-danswer	8af4f1da8e	more renaming	2025-02-03 20:10:51 -08:00
Evan Lohn	4b82440915	finished rebase and fixed issues	2025-02-03 20:10:51 -08:00
Evan Lohn	bb6d55783e	addressing PR comments	2025-02-03 20:10:51 -08:00
Evan Lohn	2b8cd63b34	main nodes renaming	2025-02-03 20:10:51 -08:00
joachim-danswer	b0c3098693	more renaming and consolidation	2025-02-03 20:10:51 -08:00
joachim-danswer	2517aa39b2	more renamings	2025-02-03 20:10:51 -08:00
joachim-danswer	ceaaa05af0	renamings and consolidation of formatting nodes in orig question retrieval	2025-02-03 20:10:51 -08:00
joachim-danswer	3b13380051	k	2025-02-03 20:10:51 -08:00
joachim-danswer	ef6e6f9556	more renaming	2025-02-03 20:10:51 -08:00
joachim-danswer	0a6808c4c1	rename initial_sub_question_creation	2025-02-03 20:10:51 -08:00
Evan Lohn	6442c56d82	remaining small find replace fix	2025-02-03 20:10:51 -08:00
Evan Lohn	e191e514b9	fixed find and replace issue	2025-02-03 20:10:51 -08:00
Evan Lohn	f33a2ffb01	node renaming	2025-02-03 20:10:51 -08:00
joachim-danswer	0578c31522	rename retrieval & consolidate_sub_answers (initial and refinement)	2025-02-03 20:10:51 -08:00
joachim-danswer	8cbdc6d8fe	fix for refinement renaming	2025-02-03 20:10:51 -08:00
joachim-danswer	60fb06da4e	rename initial_answer_generation pt 2	2025-02-03 20:10:51 -08:00
joachim-danswer	55ed6e2294	rename initial_answer_generation	2025-02-03 20:10:50 -08:00
joachim-danswer	42780d5f97	rename of individual_sub_answer_generation	2025-02-03 20:10:50 -08:00
Evan Lohn	f050d281fd	refininement->refinement	2025-02-03 20:10:50 -08:00
joachim-danswer	3ca4d532b4	renamed directories, prompts, and small citation fix	2025-02-03 20:10:50 -08:00
pablodanswer	e3e855c526	potential question fix	2025-02-03 20:10:50 -08:00
pablodanswer	23bf50b90a	address doc	2025-02-03 20:10:50 -08:00
Yuhong Sun	c43c2320e7	Tiny nits	2025-02-03 20:10:50 -08:00
Evan Lohn	01e6e9a2ba	fixed errors on import	2025-02-03 20:10:50 -08:00
Evan Lohn	bd3b1943c4	WIP PR comments	2025-02-03 20:10:50 -08:00
Evan Lohn	1dbf561db0	fix revision to match internal alembic state	2025-02-03 20:10:50 -08:00
Evan Lohn	a43a6627eb	fix revision to match internal alembic state	2025-02-03 20:10:50 -08:00
Evan Lohn	5bff8bc8ce	collapsed db migrations post-rebase (added missing file)	2025-02-03 20:10:50 -08:00
Evan Lohn	7879ba6a77	collapsed db migrations post-rebase	2025-02-03 20:10:50 -08:00
pablodanswer	a63b341913	latex update	2025-02-03 20:10:50 -08:00
pablodanswer	c062097b2a	post rebase fix	2025-02-03 20:10:50 -08:00
Evan Lohn	48e42af8e7	fix rebase issue	2025-02-03 20:10:50 -08:00
Evan Lohn	6c7f8eaefb	first pass at dead code deletion	2025-02-03 20:10:50 -08:00
joachim-danswer	3d99ad7bc4	var initialization	2025-02-03 20:10:50 -08:00
joachim-danswer	8fea571f6e	k	2025-02-03 20:10:50 -08:00
joachim-danswer	d70bbcc2ce	k	2025-02-03 20:10:50 -08:00
joachim-danswer	73769c6cae	k	2025-02-03 20:10:50 -08:00
joachim-danswer	7e98936c58	Enrichment prompts, prompt improvements, dispatch logging & reinsert empty tool response	2025-02-03 20:10:50 -08:00
joachim-danswer	4e17fc06ff	variable renaming	2025-02-03 20:10:50 -08:00
joachim-danswer	ff4df6f3bf	fix for merge error (#3814 )	2025-02-03 20:10:50 -08:00
joachim-danswer	91b929d466	graph directory renamings	2025-02-03 20:10:50 -08:00
joachim-danswer	6bef5ca7a4	persona_prompt improvements	2025-02-03 20:10:50 -08:00
joachim-danswer	4817fa0bd1	average dispatch time collection for sub-answers	2025-02-03 20:10:50 -08:00
joachim-danswer	da4a086398	added total time to logging	2025-02-03 20:10:50 -08:00
joachim-danswer	69e8c5f0fc	agent default changes/restructuring	2025-02-03 20:10:50 -08:00
joachim-danswer	12d1186888	increased logging	2025-02-03 20:10:50 -08:00
joachim-danswer	325892a21c	cleanup of refined answer generation	2025-02-03 20:10:50 -08:00
joachim-danswer	18d92559b5	application of content limitation ion refined answer as well	2025-02-03 20:10:50 -08:00
joachim-danswer	f2aeeb7b3c	Optimizations: docs for context & history - summarize history if long - introduced cited_docs from SQ as those must be provided to answer generations - limit number of docs TODO: same for refined flow	2025-02-03 20:10:50 -08:00
Evan Lohn	110c9f7e1b	nit	2025-02-03 20:10:50 -08:00
Evan Lohn	1a22af4f27	AgentPromptConfig in Answer class	2025-02-03 20:10:50 -08:00
Evan Lohn	efa32a8c04	use reranking settings and persona during preprocessing in reranker	2025-02-03 20:10:50 -08:00
Evan Lohn	9bad12968f	removed unused files	2025-02-03 20:10:50 -08:00
Evan Lohn	f1d96343a9	always send search response	2025-02-03 20:10:50 -08:00
Evan Lohn	0496ec3bb8	remove debug	2025-02-03 20:10:50 -08:00
pablodanswer	568f927b9b	improve regeneration state	2025-02-03 20:10:50 -08:00
pablodanswer	f842e15d64	nit	2025-02-03 20:10:50 -08:00
pablodanswer	3a07093663	improved timing	2025-02-03 20:10:50 -08:00
Evan Lohn	1fe966d0f7	increased timeout to get rid of asyncio logger errors	2025-02-03 20:10:50 -08:00
joachim-danswer	812172f1bd	addressing nits of EL	2025-02-03 20:10:50 -08:00
joachim-danswer	9e9bd440f4	updated answer_comparison prompt + small cleanup	2025-02-03 20:10:50 -08:00
joachim-danswer	7487b15522	refined search + question answering as sub-graphs	2025-02-03 20:10:50 -08:00
joachim-danswer	de5ce8a613	sub-graphs for initial question/search	2025-02-03 20:10:50 -08:00
joachim-danswer	8c9577aa95	refined search + question answering as sub-graphs	2025-02-03 20:10:50 -08:00
pablodanswer	4baf3dc484	minor update	2025-02-03 20:10:50 -08:00
pablodanswer	50ef5115e7	k	2025-02-03 20:10:50 -08:00
pablodanswer	a2247363af	update switching logic	2025-02-03 20:10:50 -08:00
pablodanswer	a0af8ee91c	fix toggling edge case	2025-02-03 20:10:50 -08:00
pablodanswer	25f6543443	update bool	2025-02-03 20:10:50 -08:00
pablodanswer	d52a0b96ac	various improvements	2025-02-03 20:10:50 -08:00
pablodanswer	f14b282f0f	quick nit	2025-02-03 20:10:50 -08:00
Evan Lohn	7d494cd65e	allowed empty Search Tool for non-agentic search	2025-02-03 20:10:50 -08:00
pablodanswer	139374966f	minor update - doc ordering	2025-02-03 20:10:50 -08:00
pablodanswer	bf06710215	k	2025-02-03 20:10:50 -08:00
pablodanswer	d4e0d0db05	quick nit	2025-02-03 20:10:50 -08:00
pablodanswer	f96a3ee29a	k	2025-02-03 20:10:50 -08:00
joachim-danswer	3bf6b77319	Replaced additional limit with variable	2025-02-03 20:10:50 -08:00
joachim-danswer	3b3b0c8a87	Addressing EL's comments - created vars for a couple of agent settings - moved agent configs - created a search function	2025-02-03 20:10:50 -08:00
joachim-danswer	aa8cb44a33	taking out Extraction for now	2025-02-03 20:10:50 -08:00
joachim-danswer	fc60fd0322	earlier entity extraction & sharper generation prompts	2025-02-03 20:10:50 -08:00
joachim-danswer	46402a97c7	tmp: force agent search	2025-02-03 20:10:50 -08:00
Evan Lohn	5bf6a47948	skip reranking for <=1 doc	2025-02-03 20:10:50 -08:00
Evan Lohn	2d8486bac4	stop infos when done streaming answers	2025-02-03 20:10:50 -08:00
Evan Lohn	eea6f2749a	make field nullable	2025-02-03 20:10:50 -08:00
Evan Lohn	5e9b2e41ae	persisting refined answer improvement	2025-02-03 20:10:50 -08:00
Evan Lohn	2bbe20edc3	address JR comments	2025-02-03 20:10:50 -08:00
Evan Lohn	db2004542e	fixed chat tests	2025-02-03 20:10:50 -08:00
Evan Lohn	ddbfc65ad0	implemented top-level tool calling + force search	2025-02-03 20:10:50 -08:00
Evan Lohn	982040c792	WIP, but working basic search using initial tool choice node	2025-02-03 20:10:50 -08:00
pablodanswer	4b0a4a2741	k	2025-02-03 20:10:50 -08:00
pablodanswer	28ba01b361	updated + functional	2025-02-03 20:10:50 -08:00
pablodanswer	d32d1c6079	update- reorg	2025-02-03 20:10:50 -08:00
pablodanswer	dd494d2daa	k	2025-02-03 20:10:50 -08:00
pablodanswer	eb6dbf49a1	build fix	2025-02-03 20:10:50 -08:00
joachim-danswer	e5fa411092	EL comments addressed	2025-02-03 20:10:50 -08:00
joachim-danswer	1ced8924b3	loser verification prompt	2025-02-03 20:10:50 -08:00
joachim-danswer	3c3900fac6	turning off initial search pre route decision	2025-02-03 20:10:50 -08:00
joachim-danswer	3b298e19bc	change of sub-question answer if no docs recovered	2025-02-03 20:10:50 -08:00
joachim-danswer	71eafe04a8	various fixes from Yuhong's list	2025-02-03 20:10:50 -08:00
Yuhong Sun	80d248e02d	Copy changes	2025-02-03 20:10:50 -08:00
Evan Lohn	2032fb10da	removed print statements, fixed pass through handling	2025-02-03 20:10:50 -08:00
Evan Lohn	ca1f176c61	fixed basic flow citations and second test	2025-02-03 20:10:50 -08:00
Evan Lohn	3ced9bc28b	fix for early cancellation test; solves issue with tasks being destroyed while pending	2025-02-03 20:10:50 -08:00
pablodanswer	deea9c8c3c	add agent search frontend	2025-02-03 20:10:47 -08:00
Evan Lohn	4e47c81ed8	fix alembic history	2025-02-03 20:07:57 -08:00
joachim-danswer	00cee71c18	streaming + saving of search docs of no verified ones available - sub-questions only	2025-02-03 20:07:57 -08:00
Evan Lohn	470c4d15dd	reworked history messages in agent config	2025-02-03 20:07:57 -08:00
Evan Lohn	50bacc03b3	missed files from prev commit	2025-02-03 20:07:57 -08:00
Evan Lohn	dd260140b2	basic search restructure: WIP on fixing tests	2025-02-03 20:07:57 -08:00
joachim-danswer	8aa82be12a	prompts that even further motivates to cite docs over sub-q's	2025-02-03 20:07:57 -08:00
joachim-danswer	b7f9e431a5	pydantic for LangGraph + changed ERT extraction flow	2025-02-03 20:07:57 -08:00
joachim-danswer	b9bd2ea4e2	history added to agent flow	2025-02-03 20:07:57 -08:00
pablodanswer	e4c93bed8b	minor fixes to branch	2025-02-03 20:07:57 -08:00
Evan Lohn	4fd6e36c2f	second clean commit	2025-02-03 20:07:57 -08:00