fix(backend): downgrade user-caused LLM API errors to warning level (#12516)

Requested by @majdyz

Follow-up to #12513. Anthropic/OpenAI 401, 403, and 429 errors are
user-caused (invalid API keys, forbidden access, rate limiting) and should
not be reported to Sentry as exceptions.

### Changes

**Changes in `blocks/llm.py`:**
- Anthropic `APIError` handler (line ~950): check `status_code` — use
`logger.warning()` for 401/403/429, keep `logger.error()` for server
errors
- Generic `Exception` handler in LLM block `run()` (line ~1467): same
pattern — `logger.warning()` for user-caused status codes,
`logger.exception()` for everything else
- Extracted `USER_ERROR_STATUS_CODES = (401, 403, 429)` module-level
constant
- Added `break` to short-circuit retry loop for user-caused errors
- Removed double-logging from inner Anthropic handler

**Changes in `blocks/test/test_llm.py`:**
- Added 8 regression tests covering 401/403/429 fast-exit and 500 retry
behavior

**Sentry issues addressed:**
- AUTOGPT-SERVER-8B6, 8B7, 8B8 — `[LLM-Block] Anthropic API error: Error
code: 401 - invalid x-api-key`
- Any OpenAI 401/403/429 errors hitting the generic exception handler

Part of SECRT-2166

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan

#### Test plan:
- [x] Unit tests for 401/403/429 Anthropic errors → warning log, no
retry
- [x] Unit tests for 500 Anthropic errors → error log, retry
- [x] Unit tests for 401/403/429 OpenAI errors → warning log, no retry
- [x] Unit tests for 500 OpenAI errors → error log, retry
- [x] Verified USER_ERROR_STATUS_CODES constant is used consistently
- [x] Verified no double-logging in Anthropic handler path

---
Co-authored-by: Zamil Majdy (@majdyz) <zamil.majdy@agpt.co>

---------

Co-authored-by: Zamil Majdy (@majdyz) <zamil.majdy@agpt.co>
This commit is contained in:
Otto
2026-03-24 10:59:04 +00:00
committed by GitHub
parent ee5382a064
commit f21a36ca37
2 changed files with 215 additions and 54 deletions

View File

@@ -49,6 +49,9 @@ settings = Settings()
logger = TruncatedLogger(logging.getLogger(__name__), "[LLM-Block]")
fmt = TextFormatter(autoescape=False)
# HTTP status codes for user-caused errors that should not be reported to Sentry.
USER_ERROR_STATUS_CODES = (401, 403, 429)
LLMProviderName = Literal[
ProviderName.AIML_API,
ProviderName.ANTHROPIC,
@@ -891,65 +894,60 @@ async def llm_call(
client = anthropic.AsyncAnthropic(
api_key=credentials.api_key.get_secret_value()
)
try:
resp = await client.messages.create(
model=llm_model.value,
system=sysprompt,
messages=messages,
max_tokens=max_tokens,
tools=an_tools,
timeout=600,
)
resp = await client.messages.create(
model=llm_model.value,
system=sysprompt,
messages=messages,
max_tokens=max_tokens,
tools=an_tools,
timeout=600,
)
if not resp.content:
raise ValueError("No content returned from Anthropic.")
if not resp.content:
raise ValueError("No content returned from Anthropic.")
tool_calls = None
for content_block in resp.content:
# Anthropic differs from OpenAI: we need to iterate through
# the content blocks to find the tool calls
if content_block.type == "tool_use":
if tool_calls is None:
tool_calls = []
tool_calls.append(
ToolContentBlock(
id=content_block.id,
type=content_block.type,
function=ToolCall(
name=content_block.name,
arguments=json.dumps(content_block.input),
),
)
tool_calls = None
for content_block in resp.content:
# Anthropic differs from OpenAI: we need to iterate through
# the content blocks to find the tool calls
if content_block.type == "tool_use":
if tool_calls is None:
tool_calls = []
tool_calls.append(
ToolContentBlock(
id=content_block.id,
type=content_block.type,
function=ToolCall(
name=content_block.name,
arguments=json.dumps(content_block.input),
),
)
if not tool_calls and resp.stop_reason == "tool_use":
logger.warning(
f"Tool use stop reason but no tool calls found in content. {resp}"
)
reasoning = None
for content_block in resp.content:
if hasattr(content_block, "type") and content_block.type == "thinking":
reasoning = content_block.thinking
break
return LLMResponse(
raw_response=resp,
prompt=prompt,
response=(
resp.content[0].name
if isinstance(resp.content[0], anthropic.types.ToolUseBlock)
else getattr(resp.content[0], "text", "")
),
tool_calls=tool_calls,
prompt_tokens=resp.usage.input_tokens,
completion_tokens=resp.usage.output_tokens,
reasoning=reasoning,
if not tool_calls and resp.stop_reason == "tool_use":
logger.warning(
f"Tool use stop reason but no tool calls found in content. {resp}"
)
except anthropic.APIError as e:
error_message = f"Anthropic API error: {str(e)}"
logger.error(error_message)
raise ValueError(error_message)
reasoning = None
for content_block in resp.content:
if hasattr(content_block, "type") and content_block.type == "thinking":
reasoning = content_block.thinking
break
return LLMResponse(
raw_response=resp,
prompt=prompt,
response=(
resp.content[0].name
if isinstance(resp.content[0], anthropic.types.ToolUseBlock)
else getattr(resp.content[0], "text", "")
),
tool_calls=tool_calls,
prompt_tokens=resp.usage.input_tokens,
completion_tokens=resp.usage.output_tokens,
reasoning=reasoning,
)
elif provider == "groq":
if tools:
raise ValueError("Groq does not support tools.")
@@ -1462,7 +1460,16 @@ class AIStructuredResponseGeneratorBlock(AIBlockBase):
yield "prompt", self.prompt
return
except Exception as e:
logger.exception(f"Error calling LLM: {e}")
is_user_error = (
isinstance(e, (anthropic.APIStatusError, openai.APIStatusError))
and e.status_code in USER_ERROR_STATUS_CODES
)
if is_user_error:
logger.warning(f"Error calling LLM: {e}")
error_feedback_message = f"Error calling LLM: {e}"
break
else:
logger.exception(f"Error calling LLM: {e}")
if (
"maximum context length" in str(e).lower()
or "token limit" in str(e).lower()

View File

@@ -1,9 +1,18 @@
from typing import cast
from unittest.mock import AsyncMock, MagicMock, patch
import anthropic
import httpx
import openai
import pytest
import backend.blocks.llm as llm
from backend.data.model import NodeExecutionStats
# TEST_CREDENTIALS_INPUT is a plain dict that satisfies AICredentials at runtime
# but not at the type level. Cast once here (typing.cast only informs the type
# checker; the dict itself is passed through unchanged) so individual tests do
# not each need their own type-check suppressor.
_TEST_AI_CREDENTIALS = cast(llm.AICredentials, llm.TEST_CREDENTIALS_INPUT)
class TestLLMStatsTracking:
"""Test that LLM blocks correctly track token usage statistics."""
@@ -655,3 +664,148 @@ class TestAITextSummarizerValidation:
error_message = str(exc_info.value)
assert "Expected a string summary" in error_message
assert "received dict" in error_message
def _make_anthropic_status_error(status_code: int) -> anthropic.APIStatusError:
    """Build an ``anthropic.APIStatusError`` whose response carries ``status_code``."""
    # The SDK error wraps an httpx response, so fabricate a minimal one that is
    # tied to a request against the Anthropic messages endpoint.
    fake_response = httpx.Response(
        status_code,
        request=httpx.Request("POST", "https://api.anthropic.com/v1/messages"),
    )
    message = f"Error code: {status_code}"
    return anthropic.APIStatusError(message, response=fake_response, body=None)
def _make_openai_status_error(status_code: int) -> openai.APIStatusError:
    """Build an ``openai.APIStatusError`` whose response carries ``status_code``."""
    # The SDK error wraps an httpx response, so fabricate a minimal one that is
    # tied to a request against the OpenAI chat endpoint.
    fake_request = httpx.Request("POST", "https://api.openai.com/v1/chat")
    fake_response = httpx.Response(status_code, request=fake_request)
    message = f"Error code: {status_code}"
    return openai.APIStatusError(message, response=fake_response, body=None)
class TestUserErrorStatusCodeHandling:
    """Test that user-caused LLM API errors (401/403/429) break the retry loop
    and are logged as warnings, while server errors (500) trigger retries."""

    @staticmethod
    def _build_input(**overrides):
        """Return a minimal structured-response block input.

        Keyword ``overrides`` (e.g. ``retry=3``) are forwarded to the ``Input``
        constructor on top of the shared defaults used by every test here.
        """
        return llm.AIStructuredResponseGeneratorBlock.Input(
            prompt="Test",
            expected_format={"key": "desc"},
            model=llm.DEFAULT_LLM_MODEL,
            credentials=_TEST_AI_CREDENTIALS,
            **overrides,
        )

    @staticmethod
    async def _drain_expecting_runtime_error(block, input_data) -> None:
        """Exhaust ``block.run`` and require that it ends with ``RuntimeError``."""
        with pytest.raises(RuntimeError):
            async for _ in block.run(input_data, credentials=llm.TEST_CREDENTIALS):
                pass

    async def _count_llm_attempts(self, error: Exception) -> int:
        """Run the block with ``llm_call`` always raising ``error``.

        The block is configured with ``retry=3``. Returns how many times the
        patched ``llm_call`` was awaited before the run failed.
        """
        block = llm.AIStructuredResponseGeneratorBlock()
        # AsyncMock raises an exception instance given as side_effect on every
        # await and tracks the number of awaits, so no manual counter is needed.
        failing_call = AsyncMock(side_effect=error)
        with patch.object(block, "llm_call", new=failing_call):
            await self._drain_expecting_runtime_error(
                block, self._build_input(retry=3)
            )
        return failing_call.await_count

    @pytest.mark.asyncio
    @pytest.mark.parametrize("status_code", [401, 403, 429])
    async def test_anthropic_user_error_breaks_retry_loop(self, status_code: int):
        """401/403/429 Anthropic errors should break immediately, not retry."""
        call_count = await self._count_llm_attempts(
            _make_anthropic_status_error(status_code)
        )
        assert (
            call_count == 1
        ), f"Expected exactly 1 call for status {status_code}, got {call_count}"

    @pytest.mark.asyncio
    @pytest.mark.parametrize("status_code", [401, 403, 429])
    async def test_openai_user_error_breaks_retry_loop(self, status_code: int):
        """401/403/429 OpenAI errors should break immediately, not retry."""
        call_count = await self._count_llm_attempts(
            _make_openai_status_error(status_code)
        )
        assert (
            call_count == 1
        ), f"Expected exactly 1 call for status {status_code}, got {call_count}"

    @pytest.mark.asyncio
    async def test_server_error_retries(self):
        """500 errors should be retried (not break immediately)."""
        call_count = await self._count_llm_attempts(_make_anthropic_status_error(500))
        assert (
            call_count > 1
        ), f"Expected multiple retry attempts for 500, got {call_count}"

    @pytest.mark.asyncio
    async def test_user_error_logs_warning_not_exception(self):
        """User-caused errors should log with logger.warning, not logger.exception."""
        block = llm.AIStructuredResponseGeneratorBlock()
        failing_call = AsyncMock(side_effect=_make_anthropic_status_error(401))
        with patch.object(block, "llm_call", new=failing_call):
            # Build the input before patching the logger so only log calls made
            # while the block is running are observed.
            input_data = self._build_input()
            with (
                patch.object(llm.logger, "warning") as mock_warning,
                patch.object(llm.logger, "exception") as mock_exception,
            ):
                await self._drain_expecting_runtime_error(block, input_data)
        mock_warning.assert_called_once()
        mock_exception.assert_not_called()