diff --git a/autogpt_platform/backend/backend/api/features/chat/service.py b/autogpt_platform/backend/backend/api/features/chat/service.py index f1f3156713..7d215737a3 100644 --- a/autogpt_platform/backend/backend/api/features/chat/service.py +++ b/autogpt_platform/backend/backend/api/features/chat/service.py @@ -1419,6 +1419,7 @@ async def _stream_chat_chunks( return except Exception as e: last_error = e + if _is_retryable_error(e) and retry_count < MAX_RETRIES: retry_count += 1 # Calculate delay with exponential backoff @@ -1434,12 +1435,24 @@ async def _stream_chat_chunks( continue # Retry the stream else: # Non-retryable error or max retries exceeded - logger.error( - f"Error in stream (not retrying): {e!s}", - exc_info=True, + _log_api_error( + error=e, + session_id=session.session_id if session else None, + message_count=len(messages) if messages else None, + model=model, + retry_count=retry_count, ) error_code = None error_text = str(e) + + error_details = _extract_api_error_details(e) + if error_details.get("response_body"): + body = error_details["response_body"] + if isinstance(body, dict) and body.get("error", {}).get( + "message" + ): + error_text = body["error"]["message"] + if _is_region_blocked_error(e): error_code = "MODEL_NOT_AVAILABLE_REGION" error_text = ( @@ -1456,9 +1469,12 @@ async def _stream_chat_chunks( # If we exit the retry loop without returning, it means we exhausted retries if last_error: - logger.error( - f"Max retries ({MAX_RETRIES}) exceeded. Last error: {last_error!s}", - exc_info=True, + _log_api_error( + error=last_error, + session_id=session.session_id if session else None, + message_count=len(messages) if messages else None, + model=model, + retry_count=MAX_RETRIES, ) yield StreamError(errorText=f"Max retries exceeded: {last_error!s}") yield StreamFinish() @@ -1879,6 +1895,7 @@ async def _generate_llm_continuation( break # Success, exit retry loop except Exception as e: last_error = e + if _is_retryable_error(e) and retry_count < MAX_RETRIES: retry_count += 1 delay = min( @@ -1892,17 +1909,23 @@ async def _generate_llm_continuation( await asyncio.sleep(delay) continue else: - # Non-retryable error - log and exit gracefully - logger.error( - f"Non-retryable error in LLM continuation: {e!s}", - exc_info=True, + # Non-retryable error - log details and exit gracefully + _log_api_error( + error=e, + session_id=session_id, + message_count=len(messages) if messages else None, + model=config.model, + retry_count=retry_count, ) return if last_error: - logger.error( - f"Max retries ({MAX_RETRIES}) exceeded for LLM continuation. " - f"Last error: {last_error!s}" + _log_api_error( + error=last_error, + session_id=session_id, + message_count=len(messages) if messages else None, + model=config.model, + retry_count=MAX_RETRIES, ) return @@ -1940,3 +1963,78 @@ async def _generate_llm_continuation( except Exception as e: logger.error(f"Failed to generate LLM continuation: {e}", exc_info=True) + + +def _log_api_error( + error: Exception, + session_id: str | None = None, + message_count: int | None = None, + model: str | None = None, + retry_count: int = 0, +) -> None: + """Log detailed API error information for debugging.""" + details = _extract_api_error_details(error) + details["session_id"] = session_id + details["message_count"] = message_count + details["model"] = model + details["retry_count"] = retry_count + + if isinstance(error, RateLimitError): + logger.warning(f"Rate limit error: {details}") + elif isinstance(error, APIConnectionError): + logger.warning(f"API connection error: {details}") + elif isinstance(error, APIStatusError) and error.status_code >= 500: + logger.error(f"API server error (5xx): {details}") + else: + logger.error(f"API error: {details}") + + +def _extract_api_error_details(error: Exception) -> dict[str, Any]: + """Extract detailed information from OpenAI/OpenRouter API errors.""" + error_msg = str(error) + details: dict[str, Any] = { + "error_type": type(error).__name__, + "error_message": error_msg[:500] + "..." if len(error_msg) > 500 else error_msg, + } + + if hasattr(error, "code"): + details["code"] = error.code + if hasattr(error, "param"): + details["param"] = error.param + + if isinstance(error, APIStatusError): + details["status_code"] = error.status_code + details["request_id"] = getattr(error, "request_id", None) + + if hasattr(error, "body") and error.body: + details["response_body"] = _sanitize_error_body(error.body) + + if hasattr(error, "response") and error.response: + headers = error.response.headers + details["openrouter_provider"] = headers.get("x-openrouter-provider") + details["openrouter_model"] = headers.get("x-openrouter-model") + details["retry_after"] = headers.get("retry-after") + details["rate_limit_remaining"] = headers.get("x-ratelimit-remaining") + + return details + + +def _sanitize_error_body(body: Any, max_length: int = 2000) -> dict[str, Any] | None: + """Extract only safe fields from error response body to avoid logging sensitive data.""" + if not isinstance(body, dict): + return None + + safe_fields = ("message", "type", "code", "param", "error") + sanitized: dict[str, Any] = {} + + for field in safe_fields: + if field in body: + value = body[field] + if field == "error" and isinstance(value, dict): + sanitized[field] = _sanitize_error_body(value, max_length) + elif isinstance(value, str) and len(value) > max_length: + sanitized[field] = value[:max_length] + "...[truncated]" + else: + sanitized[field] = value + + return sanitized if sanitized else None