debug(copilot): Add detailed API error logging (#11942)

## Summary
Adds comprehensive error logging for OpenRouter/OpenAI API errors to
help diagnose issues such as provider routing failures,
context-length-exceeded errors, and rate limits.

## Background
While investigating
[SECRT-1859](https://linear.app/autogpt/issue/SECRT-1859), we found that
when OpenRouter returns errors, the actual error details weren't being
captured or logged. Langfuse traces showed `provider_name: 'unknown'`
and `completion: null` without any insight into WHY all providers
rejected the request.

## Changes
- Add `_extract_api_error_details()` to extract rich information from
API errors including:
  - Status code and request ID
  - Response body (contains OpenRouter's actual error message)
  - OpenRouter-specific headers (provider, model)
  - Rate limit headers
- Add `_log_api_error()` helper that logs errors with context:
  - Session ID for correlation
  - Message count (helps identify context length issues)
  - Model being used
  - Retry count
- Update error handling in `_stream_chat_chunks()` and
`_generate_llm_continuation()` to use the new logging helpers
- Extract the provider's error message from the response body for better
user feedback, as distilled in the sketch below
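
The fallback chain for surfacing the provider's message, distilled into
a standalone sketch for reviewers. The real logic is inlined in
`_stream_chat_chunks`; the helper name `_best_error_text` is
illustrative only and not part of the change:

```python
def _best_error_text(e: Exception) -> str:
    """Illustrative only: this fallback chain is inlined in _stream_chat_chunks."""
    details = _extract_api_error_details(e)
    body = details.get("response_body")
    if isinstance(body, dict):
        err = body.get("error")
        if isinstance(err, dict) and err.get("message"):
            return err["message"]  # OpenRouter-style: {"error": {"message": ...}}
        if body.get("message"):
            return body["message"]  # flat shape: {"message": ...}
    return str(e)  # fallback: the exception's own text
```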

## Example log output
```
API error: {
  'error_type': 'APIStatusError',
  'error_message': 'Provider returned error',
  'status_code': 400,
  'request_id': 'req_xxx',
  'response_body': {'error': {'message': 'context_length_exceeded', 'type': 'invalid_request_error'}},
  'openrouter_provider': 'unknown',
  'session_id': '44fbb803-...',
  'message_count': 52,
  'model': 'anthropic/claude-opus-4.5',
  'retry_count': 0
}
```

## Testing
- [ ] Verified code passes linting (black, isort, ruff)
- [ ] Error details are properly extracted from different error types
(see the test sketch below)
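
A minimal pytest sketch for that second item, using only the public
`openai` and `httpx` constructors. The import path is a placeholder for
wherever the helper actually lives:

```python
import httpx
from openai import APIStatusError

# Placeholder path - adjust to the module that defines the helper.
from backend.copilot.chat import _extract_api_error_details


def test_extracts_details_from_api_status_error():
    request = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
    response = httpx.Response(
        400,
        headers={"x-openrouter-provider": "anthropic", "retry-after": "2"},
        json={
            "error": {
                "message": "context_length_exceeded",
                "type": "invalid_request_error",
            }
        },
        request=request,
    )
    error = APIStatusError(
        "Provider returned error", response=response, body=response.json()
    )

    details = _extract_api_error_details(error)

    assert details["error_type"] == "APIStatusError"
    assert details["status_code"] == 400
    assert details["openrouter_provider"] == "anthropic"
    assert details["retry_after"] == "2"
    # _sanitize_error_body keeps only the safe fields, recursing into "error"
    assert details["response_body"]["error"]["message"] == "context_length_exceeded"
```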

## Refs
- Linear: SECRT-1859
- Thread:
https://discord.com/channels/1126875755960336515/1467066151002571034

---------

Co-authored-by: Reinier van der Leer <pwuts@agpt.co>

```diff
@@ -1245,6 +1245,7 @@ async def _stream_chat_chunks(
             return
         except Exception as e:
             last_error = e
+
             if _is_retryable_error(e) and retry_count < MAX_RETRIES:
                 retry_count += 1
                 # Calculate delay with exponential backoff
@@ -1260,12 +1261,27 @@ async def _stream_chat_chunks(
                 continue  # Retry the stream
             else:
                 # Non-retryable error or max retries exceeded
-                logger.error(
-                    f"Error in stream (not retrying): {e!s}",
-                    exc_info=True,
+                _log_api_error(
+                    error=e,
+                    context="stream (not retrying)",
+                    session_id=session.session_id if session else None,
+                    message_count=len(messages) if messages else None,
+                    model=model,
+                    retry_count=retry_count,
                 )
+
                 error_code = None
                 error_text = str(e)
+                error_details = _extract_api_error_details(e)
+                if error_details.get("response_body"):
+                    body = error_details["response_body"]
+                    if isinstance(body, dict):
+                        err = body.get("error")
+                        if isinstance(err, dict) and err.get("message"):
+                            error_text = err["message"]
+                        elif body.get("message"):
+                            error_text = body["message"]
+
                 if _is_region_blocked_error(e):
                     error_code = "MODEL_NOT_AVAILABLE_REGION"
                     error_text = (
@@ -1282,9 +1298,13 @@ async def _stream_chat_chunks(
 
     # If we exit the retry loop without returning, it means we exhausted retries
     if last_error:
-        logger.error(
-            f"Max retries ({MAX_RETRIES}) exceeded. Last error: {last_error!s}",
-            exc_info=True,
+        _log_api_error(
+            error=last_error,
+            context=f"stream (max retries {MAX_RETRIES} exceeded)",
+            session_id=session.session_id if session else None,
+            message_count=len(messages) if messages else None,
+            model=model,
+            retry_count=MAX_RETRIES,
         )
         yield StreamError(errorText=f"Max retries exceeded: {last_error!s}")
         yield StreamFinish()
@@ -1857,6 +1877,7 @@ async def _generate_llm_continuation(
             break  # Success, exit retry loop
         except Exception as e:
             last_error = e
+
             if _is_retryable_error(e) and retry_count < MAX_RETRIES:
                 retry_count += 1
                 delay = min(
@@ -1870,17 +1891,25 @@ async def _generate_llm_continuation(
                 await asyncio.sleep(delay)
                 continue
             else:
-                # Non-retryable error - log and exit gracefully
-                logger.error(
-                    f"Non-retryable error in LLM continuation: {e!s}",
-                    exc_info=True,
+                # Non-retryable error - log details and exit gracefully
+                _log_api_error(
+                    error=e,
+                    context="LLM continuation (not retrying)",
+                    session_id=session_id,
+                    message_count=len(messages) if messages else None,
+                    model=config.model,
+                    retry_count=retry_count,
                 )
                 return
 
     if last_error:
-        logger.error(
-            f"Max retries ({MAX_RETRIES}) exceeded for LLM continuation. "
-            f"Last error: {last_error!s}"
+        _log_api_error(
+            error=last_error,
+            context=f"LLM continuation (max retries {MAX_RETRIES} exceeded)",
+            session_id=session_id,
+            message_count=len(messages) if messages else None,
+            model=config.model,
+            retry_count=MAX_RETRIES,
         )
         return
 
@@ -1920,6 +1949,91 @@ async def _generate_llm_continuation(
         logger.error(f"Failed to generate LLM continuation: {e}", exc_info=True)
 
 
+def _log_api_error(
+    error: Exception,
+    context: str,
+    session_id: str | None = None,
+    message_count: int | None = None,
+    model: str | None = None,
+    retry_count: int = 0,
+) -> None:
+    """Log detailed API error information for debugging."""
+    details = _extract_api_error_details(error)
+    details["context"] = context
+    details["session_id"] = session_id
+    details["message_count"] = message_count
+    details["model"] = model
+    details["retry_count"] = retry_count
+
+    if isinstance(error, RateLimitError):
+        logger.warning(f"Rate limit error in {context}: {details}", exc_info=error)
+    elif isinstance(error, APIConnectionError):
+        logger.warning(f"API connection error in {context}: {details}", exc_info=error)
+    elif isinstance(error, APIStatusError) and error.status_code >= 500:
+        logger.error(f"API server error (5xx) in {context}: {details}", exc_info=error)
+    else:
+        logger.error(f"API error in {context}: {details}", exc_info=error)
+
+
+def _extract_api_error_details(error: Exception) -> dict[str, Any]:
+    """Extract detailed information from OpenAI/OpenRouter API errors."""
+    error_msg = str(error)
+    details: dict[str, Any] = {
+        "error_type": type(error).__name__,
+        "error_message": (
+            error_msg[:500] + "..." if len(error_msg) > 500 else error_msg
+        ),
+    }
+    if hasattr(error, "code"):
+        details["code"] = getattr(error, "code", None)
+    if hasattr(error, "param"):
+        details["param"] = getattr(error, "param", None)
+
+    if isinstance(error, APIStatusError):
+        details["status_code"] = error.status_code
+        details["request_id"] = getattr(error, "request_id", None)
+        if hasattr(error, "body") and error.body:
+            details["response_body"] = _sanitize_error_body(error.body)
+        if hasattr(error, "response") and error.response:
+            headers = error.response.headers
+            details["openrouter_provider"] = headers.get("x-openrouter-provider")
+            details["openrouter_model"] = headers.get("x-openrouter-model")
+            details["retry_after"] = headers.get("retry-after")
+            details["rate_limit_remaining"] = headers.get("x-ratelimit-remaining")
+
+    return details
+
+
+def _sanitize_error_body(
+    body: Any, max_length: int = 2000
+) -> dict[str, Any] | str | None:
+    """Extract only safe fields from error response body to avoid logging sensitive data."""
+    if not isinstance(body, dict):
+        # Non-dict bodies (e.g., HTML error pages) - return truncated string
+        if body is not None:
+            body_str = str(body)
+            if len(body_str) > max_length:
+                return body_str[:max_length] + "...[truncated]"
+            return body_str
+        return None
+
+    safe_fields = ("message", "type", "code", "param", "error")
+    sanitized: dict[str, Any] = {}
+    for field in safe_fields:
+        if field in body:
+            value = body[field]
+            if field == "error" and isinstance(value, dict):
+                sanitized[field] = _sanitize_error_body(value, max_length)
+            elif isinstance(value, str) and len(value) > max_length:
+                sanitized[field] = value[:max_length] + "...[truncated]"
+            else:
+                sanitized[field] = value
+    return sanitized if sanitized else None
+
+
 async def _generate_llm_continuation_with_streaming(
     session_id: str,
     user_id: str | None,
```
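
For reference, a quick illustration of the two body shapes
`_sanitize_error_body` handles; the expected results in the comments are
derived from the code above, not captured output:

```python
# Dict bodies: only safe fields survive, recursing one level into "error".
_sanitize_error_body({"error": {"message": "overloaded", "api_key": "sk-..."}, "id": "gen-1"})
# -> {"error": {"message": "overloaded"}}

# Non-dict bodies (e.g. an HTML error page) are stringified and truncated.
_sanitize_error_body("<html>502 Bad Gateway</html>" * 200, max_length=40)
# -> "<html>502 Bad Gateway</html><html>502 Ba...[truncated]"
```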