mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
fix(backend): add 429/5xx patterns to is_transient_api_error and add config validators
- Add rate-limit (429) and server error (5xx) string patterns to is_transient_api_error() so the fallback retry path catches these in addition to connection-level errors (ECONNRESET). - Add ge/le validators on max_turns (1-500) and max_budget_usd (0.01-100.0) to prevent misconfiguration. - Rename max_transient -> max_transient_retries and _can_retry_transient() -> _next_transient_backoff() for clarity. - Add comprehensive tests for all new transient patterns and config boundary validation.
This commit is contained in:
@@ -136,11 +136,15 @@ class ChatConfig(BaseSettings):
|
||||
)
|
||||
claude_agent_max_turns: int = Field(
|
||||
default=50,
|
||||
ge=1,
|
||||
le=500,
|
||||
description="Maximum number of agentic turns (tool-use loops) per query. "
|
||||
"Prevents runaway tool loops from burning budget.",
|
||||
)
|
||||
claude_agent_max_budget_usd: float = Field(
|
||||
default=5.0,
|
||||
ge=0.01,
|
||||
le=100.0,
|
||||
description="Maximum spend in USD per SDK query. The CLI aborts the "
|
||||
"request if this budget is exceeded.",
|
||||
)
|
||||
|
||||
@@ -44,12 +44,32 @@ def parse_node_id_from_exec_id(node_exec_id: str) -> str:
|
||||
# Transient Anthropic API error detection
|
||||
# ---------------------------------------------------------------------------
|
||||
# Patterns in error text that indicate a transient Anthropic API error
|
||||
# (ECONNRESET / dropped TCP connection) which is retryable.
|
||||
# which is retryable. Covers:
|
||||
# - Connection-level: ECONNRESET, dropped TCP connections
|
||||
# - HTTP 429: rate-limit / too-many-requests
|
||||
# - HTTP 5xx: server errors, overloaded
|
||||
_TRANSIENT_ERROR_PATTERNS = (
|
||||
# Connection-level
|
||||
"socket connection was closed unexpectedly",
|
||||
"ECONNRESET",
|
||||
"connection was forcibly closed",
|
||||
"network socket disconnected",
|
||||
# 429 rate-limit patterns
|
||||
"rate limit",
|
||||
"rate_limit",
|
||||
"too many requests",
|
||||
"status code 429",
|
||||
# 5xx server error patterns
|
||||
"overloaded",
|
||||
"internal server error",
|
||||
"bad gateway",
|
||||
"service unavailable",
|
||||
"gateway timeout",
|
||||
"status code 529",
|
||||
"status code 500",
|
||||
"status code 502",
|
||||
"status code 503",
|
||||
"status code 504",
|
||||
)
|
||||
|
||||
FRIENDLY_TRANSIENT_MSG = "Anthropic connection interrupted — please retry"
|
||||
|
||||
@@ -2,7 +2,11 @@
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from backend.copilot.config import ChatConfig
|
||||
from backend.copilot.constants import is_transient_api_error
|
||||
|
||||
|
||||
def _make_config(**overrides) -> ChatConfig:
|
||||
@@ -258,3 +262,116 @@ class TestBuildSdkEnv:
|
||||
assert isinstance(env, dict)
|
||||
env["CLAUDE_CODE_TMPDIR"] = "/tmp/test"
|
||||
assert env["CLAUDE_CODE_TMPDIR"] == "/tmp/test"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# is_transient_api_error
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestIsTransientApiError:
|
||||
"""Verify that is_transient_api_error detects all transient patterns."""
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"error_text",
|
||||
[
|
||||
"socket connection was closed unexpectedly",
|
||||
"ECONNRESET",
|
||||
"connection was forcibly closed",
|
||||
"network socket disconnected",
|
||||
],
|
||||
)
|
||||
def test_connection_level_errors(self, error_text: str):
|
||||
assert is_transient_api_error(error_text)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"error_text",
|
||||
[
|
||||
"rate limit exceeded",
|
||||
"rate_limit_error",
|
||||
"Too Many Requests",
|
||||
"status code 429",
|
||||
],
|
||||
)
|
||||
def test_429_rate_limit_errors(self, error_text: str):
|
||||
assert is_transient_api_error(error_text)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"error_text",
|
||||
[
|
||||
"API is overloaded",
|
||||
"Internal Server Error",
|
||||
"Bad Gateway",
|
||||
"Service Unavailable",
|
||||
"Gateway Timeout",
|
||||
"status code 529",
|
||||
"status code 500",
|
||||
"status code 502",
|
||||
"status code 503",
|
||||
"status code 504",
|
||||
],
|
||||
)
|
||||
def test_5xx_server_errors(self, error_text: str):
|
||||
assert is_transient_api_error(error_text)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"error_text",
|
||||
[
|
||||
"invalid_api_key",
|
||||
"Authentication failed",
|
||||
"prompt is too long",
|
||||
"model not found",
|
||||
"",
|
||||
],
|
||||
)
|
||||
def test_non_transient_errors(self, error_text: str):
|
||||
assert not is_transient_api_error(error_text)
|
||||
|
||||
def test_case_insensitive(self):
|
||||
assert is_transient_api_error("SOCKET CONNECTION WAS CLOSED UNEXPECTEDLY")
|
||||
assert is_transient_api_error("econnreset")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config validators for max_turns / max_budget_usd
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestConfigValidators:
|
||||
"""Verify ge/le bounds on max_turns and max_budget_usd."""
|
||||
|
||||
def test_max_turns_rejects_zero(self):
|
||||
with pytest.raises(ValidationError):
|
||||
_make_config(claude_agent_max_turns=0)
|
||||
|
||||
def test_max_turns_rejects_negative(self):
|
||||
with pytest.raises(ValidationError):
|
||||
_make_config(claude_agent_max_turns=-1)
|
||||
|
||||
def test_max_turns_rejects_above_500(self):
|
||||
with pytest.raises(ValidationError):
|
||||
_make_config(claude_agent_max_turns=501)
|
||||
|
||||
def test_max_turns_accepts_boundary_values(self):
|
||||
cfg_low = _make_config(claude_agent_max_turns=1)
|
||||
assert cfg_low.claude_agent_max_turns == 1
|
||||
cfg_high = _make_config(claude_agent_max_turns=500)
|
||||
assert cfg_high.claude_agent_max_turns == 500
|
||||
|
||||
def test_max_budget_rejects_zero(self):
|
||||
with pytest.raises(ValidationError):
|
||||
_make_config(claude_agent_max_budget_usd=0.0)
|
||||
|
||||
def test_max_budget_rejects_negative(self):
|
||||
with pytest.raises(ValidationError):
|
||||
_make_config(claude_agent_max_budget_usd=-1.0)
|
||||
|
||||
def test_max_budget_rejects_above_100(self):
|
||||
with pytest.raises(ValidationError):
|
||||
_make_config(claude_agent_max_budget_usd=100.01)
|
||||
|
||||
def test_max_budget_accepts_boundary_values(self):
|
||||
cfg_low = _make_config(claude_agent_max_budget_usd=0.01)
|
||||
assert cfg_low.claude_agent_max_budget_usd == 0.01
|
||||
cfg_high = _make_config(claude_agent_max_budget_usd=100.0)
|
||||
assert cfg_high.claude_agent_max_budget_usd == 100.0
|
||||
|
||||
@@ -2027,20 +2027,20 @@ async def stream_chat_completion_sdk(
|
||||
# Transient retry helper — deduplicates the logic shared between
|
||||
# _HandledStreamError and the generic except-Exception handler.
|
||||
transient_retries = 0
|
||||
max_transient = config.claude_agent_max_transient_retries
|
||||
max_transient_retries = config.claude_agent_max_transient_retries
|
||||
|
||||
def _can_retry_transient() -> int | None:
|
||||
"""Check if a transient retry is possible.
|
||||
def _next_transient_backoff() -> int | None:
|
||||
"""Return the next backoff delay in seconds, or ``None`` to surface the error.
|
||||
|
||||
Returns the backoff seconds if a retry should be attempted,
|
||||
or ``None`` if the error should be surfaced to the user.
|
||||
Mutates outer ``transient_retries`` via nonlocal.
|
||||
or ``None`` if retries are exhausted or events were already
|
||||
yielded. Mutates outer ``transient_retries`` via nonlocal.
|
||||
"""
|
||||
nonlocal transient_retries
|
||||
if events_yielded > 0:
|
||||
return None
|
||||
transient_retries += 1
|
||||
if transient_retries > max_transient:
|
||||
if transient_retries > max_transient_retries:
|
||||
return None
|
||||
return 2 ** (transient_retries - 1) # 1s, 2s, 4s, ...
|
||||
|
||||
@@ -2171,14 +2171,14 @@ async def stream_chat_completion_sdk(
|
||||
if exc.code == "transient_api_error" or is_transient_api_error(
|
||||
str(exc)
|
||||
):
|
||||
backoff = _can_retry_transient()
|
||||
backoff = _next_transient_backoff()
|
||||
if backoff is not None:
|
||||
logger.warning(
|
||||
"%s Transient error — retrying in %ds (%d/%d)",
|
||||
log_prefix,
|
||||
backoff,
|
||||
transient_retries,
|
||||
max_transient,
|
||||
max_transient_retries,
|
||||
)
|
||||
yield StreamStatus(
|
||||
message=f"Connection interrupted, retrying in {backoff}s…"
|
||||
@@ -2247,14 +2247,14 @@ async def stream_chat_completion_sdk(
|
||||
# Transient API errors (ECONNRESET, 429, 5xx) — retry
|
||||
# with exponential backoff via the shared helper.
|
||||
if is_transient:
|
||||
backoff = _can_retry_transient()
|
||||
backoff = _next_transient_backoff()
|
||||
if backoff is not None:
|
||||
logger.warning(
|
||||
"%s Transient exception — retrying in %ds (%d/%d)",
|
||||
log_prefix,
|
||||
backoff,
|
||||
transient_retries,
|
||||
max_transient,
|
||||
max_transient_retries,
|
||||
)
|
||||
yield StreamStatus(
|
||||
message=f"Connection interrupted, retrying "
|
||||
|
||||
Reference in New Issue
Block a user