mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
## Summary

- Detect transient Anthropic API errors (ECONNRESET, "socket connection was closed unexpectedly") across all error paths in the copilot SDK streaming loop
- Replace raw technical error messages with user-friendly text: **"Anthropic connection interrupted — please retry"**
- Add `retryable` field to `StreamError` model so the frontend can distinguish retryable errors
- Add **"Try Again" button** on the error card for transient errors, which re-sends the last user message

### Background

Sentry issue [AUTOGPT-SERVER-875](https://significant-gravitas.sentry.io/issues/AUTOGPT-SERVER-875) — 25+ events since March 13, caused by Anthropic API infrastructure instability (confirmed by their status page). The same SDK and code run on dev and prod; the errors appear only on prod because of its higher volume of long-running streaming sessions.

### Changes

**Backend (`constants.py`, `service.py`, `response_adapter.py`, `response_model.py`):**

- `is_transient_api_error()` — pattern-matching helper for known transient error strings
- Intercept transient errors in 3 places: `AssistantMessage.error`, stream exceptions, `BaseException` handler
- Use friendly message in error markers persisted to session (so it shows properly on page refresh too)
- `StreamError.retryable` field for frontend consumption

**Frontend (`ChatContainer`, `ChatMessagesContainer`, `MessagePartRenderer`):**

- Thread `onRetry` callback from `ChatContainer` → `ChatMessagesContainer` → `MessagePartRenderer`
- Detect transient error text in error markers and show "Try Again" button via existing `ErrorCard.onRetry`
- Clicking "Try Again" re-sends the last user message (backend auto-cleans stale error markers)

Fixes SECRT-2128, SECRT-2129, SECRT-2130

## Test plan

- [ ] Verify transient error detection with `is_transient_api_error()` for known patterns
- [ ] Confirm error card shows "Anthropic connection interrupted — please retry" instead of raw socket error
- [ ] Confirm "Try Again" button appears on transient error cards
- [ ] Confirm "Try Again" re-sends the last user message successfully
- [ ] Confirm non-transient errors (e.g., "Prompt is too long") still show original error text without retry button
- [ ] Verify error marker persists correctly on page refresh
62 lines
2.5 KiB
Python
62 lines
2.5 KiB
Python
"""Shared constants for the CoPilot module."""
|
|
|
|
# Text-based markers placed at the start of special messages; the frontend
# parses them to decide how a message is rendered.
# Each marker carries a hex suffix so that an LLM is virtually certain never to
# produce one by accident, which prevents false-positive marker detection in
# ordinary conversation.

# Rendered as an ErrorCard.
COPILOT_ERROR_PREFIX = "[__COPILOT_ERROR_f7a1__]"
# Rendered as an ErrorCard that also offers a retry.
COPILOT_RETRYABLE_ERROR_PREFIX = "[__COPILOT_RETRYABLE_ERROR_a9c2__]"
# Rendered as a system info message.
COPILOT_SYSTEM_PREFIX = "[__COPILOT_SYSTEM_e3b0__]"
|
|
|
|
# Every synthetic ID generated by CoPilot block execution starts with this
# prefix, so CoPilot-generated records can be told apart from real graph
# execution records in PendingHumanReview and other tables.
COPILOT_SYNTHETIC_ID_PREFIX = "copilot-"

# Session-scoped and node-scoped synthetic IDs extend the shared prefix.
COPILOT_SESSION_PREFIX = COPILOT_SYNTHETIC_ID_PREFIX + "session-"
COPILOT_NODE_PREFIX = COPILOT_SYNTHETIC_ID_PREFIX + "node-"

# A synthetic node_exec_id is laid out as "{node_id}:{random_hex}"; this is the
# separator between the two parts. Recover node_id with rsplit(":", 1)[0].
COPILOT_NODE_EXEC_ID_SEPARATOR = ":"
|
|
|
|
# Strings used for the context-compaction notice shown to users.
COMPACTION_TOOL_NAME = "context_compaction"
COMPACTION_DONE_MSG = "Earlier messages were summarized to fit within context limits."
|
|
|
|
|
|
def is_copilot_synthetic_id(id_value: str) -> bool:
    """Tell whether *id_value* is a CoPilot synthetic ID.

    An ID counts as synthetic (i.e. not produced by a real graph execution)
    when it begins with ``COPILOT_SYNTHETIC_ID_PREFIX``.
    """
    synthetic_prefix = COPILOT_SYNTHETIC_ID_PREFIX
    return id_value.startswith(synthetic_prefix)
|
|
|
|
|
|
def parse_node_id_from_exec_id(node_exec_id: str) -> str:
    """Recover the node_id encoded in a synthetic node_exec_id.

    The expected layout is "{node_id}:{random_hex}". The string is split once,
    from the right, on ``COPILOT_NODE_EXEC_ID_SEPARATOR``, so only the final
    separator is consumed and everything before it is returned. A value that
    contains no separator is returned unchanged.
    """
    node_id, *_ = node_exec_id.rsplit(COPILOT_NODE_EXEC_ID_SEPARATOR, maxsplit=1)
    return node_id
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Transient Anthropic API error detection
|
|
# ---------------------------------------------------------------------------
|
|
# Patterns in error text that indicate a transient Anthropic API error
|
|
# (ECONNRESET / dropped TCP connection) which is retryable.
|
|
_TRANSIENT_ERROR_PATTERNS = (
|
|
"socket connection was closed unexpectedly",
|
|
"ECONNRESET",
|
|
"connection was forcibly closed",
|
|
"network socket disconnected",
|
|
)
|
|
|
|
FRIENDLY_TRANSIENT_MSG = "Anthropic connection interrupted — please retry"
|
|
|
|
|
|
def is_transient_api_error(error_text: str) -> bool:
    """Report whether *error_text* looks like a known transient Anthropic API error.

    The check is a case-insensitive substring search: both the error text and
    each entry of ``_TRANSIENT_ERROR_PATTERNS`` are lower-cased before
    comparison, and the first match wins.
    """
    haystack = error_text.lower()
    for pattern in _TRANSIENT_ERROR_PATTERNS:
        if pattern.lower() in haystack:
            return True
    return False
|