fix(chat): improve LLM_SOCKET_READ_TIMEOUT user experience

2026-04-17 23:46:47 +00:00 · 2026-04-17 12:35:02 -07:00
4 changed files with 34 additions and 3 deletions
--- a/backend/onyx/chat/process_message.py
+++ b/backend/onyx/chat/process_message.py
@@ -93,6 +93,7 @@ from onyx.llm.factory import get_llm_for_persona
 from onyx.llm.factory import get_llm_token_counter
 from onyx.llm.interfaces import LLM
 from onyx.llm.interfaces import LLMUserIdentity
+from onyx.llm.multi_llm import LLMTimeoutError
 from onyx.llm.override_models import LLMOverride
 from onyx.llm.request_context import reset_llm_mock_response
 from onyx.llm.request_context import set_llm_mock_response
@@ -1166,6 +1167,32 @@ def _run_models(
            else:
                if item is _MODEL_DONE:
                    models_remaining -= 1
+                elif isinstance(item, LLMTimeoutError):
+                    model_llm = setup.llms[model_idx]
+                    error_msg = (
+                        "The LLM took too long to respond. "
+                        "If you're running a local model, try increasing the "
+                        "LLM_SOCKET_READ_TIMEOUT environment variable "
+                        "(current default: 120 seconds)."
+                    )
+                    stack_trace = "".join(
+                        traceback.format_exception(type(item), item, item.__traceback__)
+                    )
+                    if model_llm.config.api_key and len(model_llm.config.api_key) > 2:
+                        stack_trace = stack_trace.replace(
+                            model_llm.config.api_key, "[REDACTED_API_KEY]"
+                        )
+                    yield StreamingError(
+                        error=error_msg,
+                        stack_trace=stack_trace,
+                        error_code="CONNECTION_ERROR",
+                        is_retryable=True,
+                        details={
+                            "model": model_llm.config.model_name,
+                            "provider": model_llm.config.model_provider,
+                            "model_index": model_idx,
+                        },
+                    )
                elif isinstance(item, Exception):
                    # Yield a tagged error for this model but keep the other models running.
                    # Do NOT decrement models_remaining — _run_model's finally always posts
--- a/backend/onyx/llm/utils.py
+++ b/backend/onyx/llm/utils.py
@@ -290,7 +290,11 @@ def litellm_exception_to_error_msg(
        error_code = "BUDGET_EXCEEDED"
        is_retryable = False
    elif isinstance(core_exception, Timeout):
-        error_msg = "Request timed out: The operation took too long to complete. Please try again."
+        error_msg = (
+            "The LLM took too long to respond. "
+            "If you're running a local model, try increasing the "
+            "LLM_SOCKET_READ_TIMEOUT environment variable (current default: 120 seconds)."
+        )
        error_code = "CONNECTION_ERROR"
        is_retryable = True
    elif isinstance(core_exception, APIError):
--- a/deployment/docker_compose/env.template
+++ b/deployment/docker_compose/env.template
@@ -172,7 +172,7 @@ LOG_ONYX_MODEL_INTERACTIONS=False

 ## Gen AI Settings
 # GEN_AI_MAX_TOKENS=
-# LLM_SOCKET_READ_TIMEOUT=
+LLM_SOCKET_READ_TIMEOUT=120
 # MAX_CHUNKS_FED_TO_CHAT=
 # DISABLE_LITELLM_STREAMING=
 # LITELLM_EXTRA_HEADERS=
--- a/deployment/helm/charts/onyx/values.yaml
+++ b/deployment/helm/charts/onyx/values.yaml
@@ -1262,7 +1262,7 @@ configMap:
  S3_FILE_STORE_BUCKET_NAME: ""
  # Gen AI Settings
  GEN_AI_MAX_TOKENS: ""
-  LLM_SOCKET_READ_TIMEOUT: "60"
+  LLM_SOCKET_READ_TIMEOUT: "120"
  MAX_CHUNKS_FED_TO_CHAT: ""
  # Query Options
  DOC_TIME_DECAY: ""