debug(copilot): Add detailed API error logging (#11942)

## Summary
Adds comprehensive error logging for OpenRouter/OpenAI API errors to
help diagnose issues such as provider routing failures,
context-length-exceeded errors, and rate limits.

## Background
While investigating
[SECRT-1859](https://linear.app/autogpt/issue/SECRT-1859), we found that
when OpenRouter returns errors, the actual error details weren't being
captured or logged. Langfuse traces showed `provider_name: 'unknown'`
and `completion: null` without any insight into WHY all providers
rejected the request.

## Changes
- Add `_extract_api_error_details()` to extract rich information from
API errors including:
  - Status code and request ID
  - Response body (contains OpenRouter's actual error message)
  - OpenRouter-specific headers (provider, model)
  - Rate limit headers
- Add `_log_api_error()` helper that logs errors with context:
  - Session ID for correlation
  - Message count (helps identify context length issues)
  - Model being used
  - Retry count
- Update error handling in `_stream_chat_chunks()` and
`_generate_llm_continuation()` to use the new logging helpers
- Extract the provider's error message from the response body for better
user feedback, as distilled in the sketch below
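
The fallback chain for surfacing the provider's message, distilled into
a standalone sketch for reviewers. The real logic is inlined in
`_stream_chat_chunks`; the helper name `_best_error_text` is
illustrative only and not part of the change:

```python
def _best_error_text(e: Exception) -> str:
    """Illustrative only: this fallback chain is inlined in _stream_chat_chunks."""
    details = _extract_api_error_details(e)
    body = details.get("response_body")
    if isinstance(body, dict):
        err = body.get("error")
        if isinstance(err, dict) and err.get("message"):
            return err["message"]  # OpenRouter-style: {"error": {"message": ...}}
        if body.get("message"):
            return body["message"]  # flat shape: {"message": ...}
    return str(e)  # fallback: the exception's own text
```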

## Example log output
```
API error: {
  'error_type': 'APIStatusError',
  'error_message': 'Provider returned error',
  'status_code': 400,
  'request_id': 'req_xxx',
  'response_body': {'error': {'message': 'context_length_exceeded', 'type': 'invalid_request_error'}},
  'openrouter_provider': 'unknown',
  'session_id': '44fbb803-...',
  'message_count': 52,
  'model': 'anthropic/claude-opus-4.5',
  'retry_count': 0
}
```

## Testing
- [ ] Verified code passes linting (black, isort, ruff)
- [ ] Error details are properly extracted from different error types
(see the test sketch below)
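
A minimal pytest sketch for that second item, using only the public
`openai` and `httpx` constructors. The import path is a placeholder for
wherever the helper actually lives:

```python
import httpx
from openai import APIStatusError

# Placeholder path - adjust to the module that defines the helper.
from backend.copilot.chat import _extract_api_error_details


def test_extracts_details_from_api_status_error():
    request = httpx.Request("POST", "https://openrouter.ai/api/v1/chat/completions")
    response = httpx.Response(
        400,
        headers={"x-openrouter-provider": "anthropic", "retry-after": "2"},
        json={
            "error": {
                "message": "context_length_exceeded",
                "type": "invalid_request_error",
            }
        },
        request=request,
    )
    error = APIStatusError(
        "Provider returned error", response=response, body=response.json()
    )

    details = _extract_api_error_details(error)

    assert details["error_type"] == "APIStatusError"
    assert details["status_code"] == 400
    assert details["openrouter_provider"] == "anthropic"
    assert details["retry_after"] == "2"
    # _sanitize_error_body keeps only the safe fields, recursing into "error"
    assert details["response_body"]["error"]["message"] == "context_length_exceeded"
```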

## Refs
- Linear: SECRT-1859
- Thread:
https://discord.com/channels/1126875755960336515/1467066151002571034

---------

Co-authored-by: Reinier van der Leer <pwuts@agpt.co>

```diff
@@ -1245,6 +1245,7 @@ async def _stream_chat_chunks(
             return
         except Exception as e:
             last_error = e
+
             if _is_retryable_error(e) and retry_count < MAX_RETRIES:
                 retry_count += 1
                 # Calculate delay with exponential backoff
@@ -1260,12 +1261,27 @@ async def _stream_chat_chunks(
                 continue  # Retry the stream
             else:
                 # Non-retryable error or max retries exceeded
-                logger.error(
-                    f"Error in stream (not retrying): {e!s}",
-                    exc_info=True,
+                _log_api_error(
+                    error=e,
+                    context="stream (not retrying)",
+                    session_id=session.session_id if session else None,
+                    message_count=len(messages) if messages else None,
+                    model=model,
+                    retry_count=retry_count,
                 )
+
                 error_code = None
                 error_text = str(e)
+                error_details = _extract_api_error_details(e)
+                if error_details.get("response_body"):
+                    body = error_details["response_body"]
+                    if isinstance(body, dict):
+                        err = body.get("error")
+                        if isinstance(err, dict) and err.get("message"):
+                            error_text = err["message"]
+                        elif body.get("message"):
+                            error_text = body["message"]
+
                 if _is_region_blocked_error(e):
                     error_code = "MODEL_NOT_AVAILABLE_REGION"
                     error_text = (
@@ -1282,9 +1298,13 @@ async def _stream_chat_chunks(
 
     # If we exit the retry loop without returning, it means we exhausted retries
     if last_error:
-        logger.error(
-            f"Max retries ({MAX_RETRIES}) exceeded. Last error: {last_error!s}",
-            exc_info=True,
+        _log_api_error(
+            error=last_error,
+            context=f"stream (max retries {MAX_RETRIES} exceeded)",
+            session_id=session.session_id if session else None,
+            message_count=len(messages) if messages else None,
+            model=model,
+            retry_count=MAX_RETRIES,
         )
         yield StreamError(errorText=f"Max retries exceeded: {last_error!s}")
         yield StreamFinish()
@@ -1857,6 +1877,7 @@ async def _generate_llm_continuation(
             break  # Success, exit retry loop
         except Exception as e:
             last_error = e
+
             if _is_retryable_error(e) and retry_count < MAX_RETRIES:
                 retry_count += 1
                 delay = min(
@@ -1870,17 +1891,25 @@ async def _generate_llm_continuation(
                 await asyncio.sleep(delay)
                 continue
             else:
-                # Non-retryable error - log and exit gracefully
-                logger.error(
-                    f"Non-retryable error in LLM continuation: {e!s}",
-                    exc_info=True,
+                # Non-retryable error - log details and exit gracefully
+                _log_api_error(
+                    error=e,
+                    context="LLM continuation (not retrying)",
+                    session_id=session_id,
+                    message_count=len(messages) if messages else None,
+                    model=config.model,
+                    retry_count=retry_count,
                 )
                 return
 
     if last_error:
-        logger.error(
-            f"Max retries ({MAX_RETRIES}) exceeded for LLM continuation. "
-            f"Last error: {last_error!s}"
+        _log_api_error(
+            error=last_error,
+            context=f"LLM continuation (max retries {MAX_RETRIES} exceeded)",
+            session_id=session_id,
+            message_count=len(messages) if messages else None,
+            model=config.model,
+            retry_count=MAX_RETRIES,
         )
         return
 
@@ -1920,6 +1949,91 @@ async def _generate_llm_continuation(
         logger.error(f"Failed to generate LLM continuation: {e}", exc_info=True)
 
 
+def _log_api_error(
+    error: Exception,
+    context: str,
+    session_id: str | None = None,
+    message_count: int | None = None,
+    model: str | None = None,
+    retry_count: int = 0,
+) -> None:
+    """Log detailed API error information for debugging."""
+    details = _extract_api_error_details(error)
+    details["context"] = context
+    details["session_id"] = session_id
+    details["message_count"] = message_count
+    details["model"] = model
+    details["retry_count"] = retry_count
+
+    if isinstance(error, RateLimitError):
+        logger.warning(f"Rate limit error in {context}: {details}", exc_info=error)
+    elif isinstance(error, APIConnectionError):
+        logger.warning(f"API connection error in {context}: {details}", exc_info=error)
+    elif isinstance(error, APIStatusError) and error.status_code >= 500:
+        logger.error(f"API server error (5xx) in {context}: {details}", exc_info=error)
+    else:
+        logger.error(f"API error in {context}: {details}", exc_info=error)
+
+
+def _extract_api_error_details(error: Exception) -> dict[str, Any]:
+    """Extract detailed information from OpenAI/OpenRouter API errors."""
+    error_msg = str(error)
+    details: dict[str, Any] = {
+        "error_type": type(error).__name__,
+        "error_message": (
+            error_msg[:500] + "..." if len(error_msg) > 500 else error_msg
+        ),
+    }
+    if hasattr(error, "code"):
+        details["code"] = getattr(error, "code", None)
+    if hasattr(error, "param"):
+        details["param"] = getattr(error, "param", None)
+
+    if isinstance(error, APIStatusError):
+        details["status_code"] = error.status_code
+        details["request_id"] = getattr(error, "request_id", None)
+        if hasattr(error, "body") and error.body:
+            details["response_body"] = _sanitize_error_body(error.body)
+        if hasattr(error, "response") and error.response:
+            headers = error.response.headers
+            details["openrouter_provider"] = headers.get("x-openrouter-provider")
+            details["openrouter_model"] = headers.get("x-openrouter-model")
+            details["retry_after"] = headers.get("retry-after")
+            details["rate_limit_remaining"] = headers.get("x-ratelimit-remaining")
+
+    return details
+
+
+def _sanitize_error_body(
+    body: Any, max_length: int = 2000
+) -> dict[str, Any] | str | None:
+    """Extract only safe fields from error response body to avoid logging sensitive data."""
+    if not isinstance(body, dict):
+        # Non-dict bodies (e.g., HTML error pages) - return truncated string
+        if body is not None:
+            body_str = str(body)
+            if len(body_str) > max_length:
+                return body_str[:max_length] + "...[truncated]"
+            return body_str
+        return None
+
+    safe_fields = ("message", "type", "code", "param", "error")
+    sanitized: dict[str, Any] = {}
+    for field in safe_fields:
+        if field in body:
+            value = body[field]
+            if field == "error" and isinstance(value, dict):
+                sanitized[field] = _sanitize_error_body(value, max_length)
+            elif isinstance(value, str) and len(value) > max_length:
+                sanitized[field] = value[:max_length] + "...[truncated]"
+            else:
+                sanitized[field] = value
+    return sanitized if sanitized else None
+
+
 async def _generate_llm_continuation_with_streaming(
     session_id: str,
     user_id: str | None,
```
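
For reference, a quick illustration of the two body shapes
`_sanitize_error_body` handles; the expected results in the comments are
derived from the code above, not captured output:

```python
# Dict bodies: only safe fields survive, recursing one level into "error".
_sanitize_error_body({"error": {"message": "overloaded", "api_key": "sk-..."}, "id": "gen-1"})
# -> {"error": {"message": "overloaded"}}

# Non-dict bodies (e.g. an HTML error page) are stringified and truncated.
_sanitize_error_body("<html>502 Bad Gateway</html>" * 200, max_length=40)
# -> "<html>502 Bad Gateway</html><html>502 Ba...[truncated]"
```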