fix(backend): add 429/5xx patterns to is_transient_api_error and add config validators

- Add rate-limit (429) and server error (5xx) string patterns to is_transient_api_error() so the fallback retry path catches these in addition to connection-level errors (ECONNRESET). - Add ge/le validators on max_turns (1-500) and max_budget_usd (0.01-100.0) to prevent misconfiguration. - Rename max_transient -> max_transient_retries and _can_retry_transient() -> _next_transient_backoff() for clarity. - Add comprehensive tests for all new transient patterns and config boundary validation.
2026-04-08 03:00:28 -04:00 · 2026-04-02 06:21:51 +02:00
parent 32bfe1b209
commit b3d9e9e856
4 changed files with 152 additions and 11 deletions
--- a/autogpt_platform/backend/backend/copilot/config.py
+++ b/autogpt_platform/backend/backend/copilot/config.py
@@ -136,11 +136,15 @@ class ChatConfig(BaseSettings):
    )
    claude_agent_max_turns: int = Field(
        default=50,
+        ge=1,
+        le=500,
        description="Maximum number of agentic turns (tool-use loops) per query. "
        "Prevents runaway tool loops from burning budget.",
    )
    claude_agent_max_budget_usd: float = Field(
        default=5.0,
+        ge=0.01,
+        le=100.0,
        description="Maximum spend in USD per SDK query. The CLI aborts the "
        "request if this budget is exceeded.",
    )
--- a/autogpt_platform/backend/backend/copilot/constants.py
+++ b/autogpt_platform/backend/backend/copilot/constants.py
@@ -44,12 +44,32 @@ def parse_node_id_from_exec_id(node_exec_id: str) -> str:
 # Transient Anthropic API error detection
 # ---------------------------------------------------------------------------
 # Patterns in error text that indicate a transient Anthropic API error
-# (ECONNRESET / dropped TCP connection) which is retryable.
+# which is retryable.  Covers:
+#   - Connection-level: ECONNRESET, dropped TCP connections
+#   - HTTP 429: rate-limit / too-many-requests
+#   - HTTP 5xx: server errors, overloaded
 _TRANSIENT_ERROR_PATTERNS = (
+    # Connection-level
    "socket connection was closed unexpectedly",
    "ECONNRESET",
    "connection was forcibly closed",
    "network socket disconnected",
+    # 429 rate-limit patterns
+    "rate limit",
+    "rate_limit",
+    "too many requests",
+    "status code 429",
+    # 5xx server error patterns
+    "overloaded",
+    "internal server error",
+    "bad gateway",
+    "service unavailable",
+    "gateway timeout",
+    "status code 529",
+    "status code 500",
+    "status code 502",
+    "status code 503",
+    "status code 504",
 )

 FRIENDLY_TRANSIENT_MSG = "Anthropic connection interrupted — please retry"
--- a/autogpt_platform/backend/backend/copilot/sdk/p0_guardrails_test.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/p0_guardrails_test.py
@@ -2,7 +2,11 @@

 from unittest.mock import patch

+import pytest
+from pydantic import ValidationError
+
 from backend.copilot.config import ChatConfig
+from backend.copilot.constants import is_transient_api_error


 def _make_config(**overrides) -> ChatConfig:
@@ -258,3 +262,116 @@ class TestBuildSdkEnv:
            assert isinstance(env, dict)
            env["CLAUDE_CODE_TMPDIR"] = "/tmp/test"
            assert env["CLAUDE_CODE_TMPDIR"] == "/tmp/test"
+
+
+# ---------------------------------------------------------------------------
+# is_transient_api_error
+# ---------------------------------------------------------------------------
+
+
+class TestIsTransientApiError:
+    """Verify that is_transient_api_error detects all transient patterns."""
+
+    @pytest.mark.parametrize(
+        "error_text",
+        [
+            "socket connection was closed unexpectedly",
+            "ECONNRESET",
+            "connection was forcibly closed",
+            "network socket disconnected",
+        ],
+    )
+    def test_connection_level_errors(self, error_text: str):
+        assert is_transient_api_error(error_text)
+
+    @pytest.mark.parametrize(
+        "error_text",
+        [
+            "rate limit exceeded",
+            "rate_limit_error",
+            "Too Many Requests",
+            "status code 429",
+        ],
+    )
+    def test_429_rate_limit_errors(self, error_text: str):
+        assert is_transient_api_error(error_text)
+
+    @pytest.mark.parametrize(
+        "error_text",
+        [
+            "API is overloaded",
+            "Internal Server Error",
+            "Bad Gateway",
+            "Service Unavailable",
+            "Gateway Timeout",
+            "status code 529",
+            "status code 500",
+            "status code 502",
+            "status code 503",
+            "status code 504",
+        ],
+    )
+    def test_5xx_server_errors(self, error_text: str):
+        assert is_transient_api_error(error_text)
+
+    @pytest.mark.parametrize(
+        "error_text",
+        [
+            "invalid_api_key",
+            "Authentication failed",
+            "prompt is too long",
+            "model not found",
+            "",
+        ],
+    )
+    def test_non_transient_errors(self, error_text: str):
+        assert not is_transient_api_error(error_text)
+
+    def test_case_insensitive(self):
+        assert is_transient_api_error("SOCKET CONNECTION WAS CLOSED UNEXPECTEDLY")
+        assert is_transient_api_error("econnreset")
+
+
+# ---------------------------------------------------------------------------
+# Config validators for max_turns / max_budget_usd
+# ---------------------------------------------------------------------------
+
+
+class TestConfigValidators:
+    """Verify ge/le bounds on max_turns and max_budget_usd."""
+
+    def test_max_turns_rejects_zero(self):
+        with pytest.raises(ValidationError):
+            _make_config(claude_agent_max_turns=0)
+
+    def test_max_turns_rejects_negative(self):
+        with pytest.raises(ValidationError):
+            _make_config(claude_agent_max_turns=-1)
+
+    def test_max_turns_rejects_above_500(self):
+        with pytest.raises(ValidationError):
+            _make_config(claude_agent_max_turns=501)
+
+    def test_max_turns_accepts_boundary_values(self):
+        cfg_low = _make_config(claude_agent_max_turns=1)
+        assert cfg_low.claude_agent_max_turns == 1
+        cfg_high = _make_config(claude_agent_max_turns=500)
+        assert cfg_high.claude_agent_max_turns == 500
+
+    def test_max_budget_rejects_zero(self):
+        with pytest.raises(ValidationError):
+            _make_config(claude_agent_max_budget_usd=0.0)
+
+    def test_max_budget_rejects_negative(self):
+        with pytest.raises(ValidationError):
+            _make_config(claude_agent_max_budget_usd=-1.0)
+
+    def test_max_budget_rejects_above_100(self):
+        with pytest.raises(ValidationError):
+            _make_config(claude_agent_max_budget_usd=100.01)
+
+    def test_max_budget_accepts_boundary_values(self):
+        cfg_low = _make_config(claude_agent_max_budget_usd=0.01)
+        assert cfg_low.claude_agent_max_budget_usd == 0.01
+        cfg_high = _make_config(claude_agent_max_budget_usd=100.0)
+        assert cfg_high.claude_agent_max_budget_usd == 100.0
--- a/autogpt_platform/backend/backend/copilot/sdk/service.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/service.py
@@ -2027,20 +2027,20 @@ async def stream_chat_completion_sdk(
        # Transient retry helper — deduplicates the logic shared between
        # _HandledStreamError and the generic except-Exception handler.
        transient_retries = 0
-        max_transient = config.claude_agent_max_transient_retries
+        max_transient_retries = config.claude_agent_max_transient_retries

-        def _can_retry_transient() -> int | None:
-            """Check if a transient retry is possible.
+        def _next_transient_backoff() -> int | None:
+            """Return the next backoff delay in seconds, or ``None`` to surface the error.

            Returns the backoff seconds if a retry should be attempted,
-            or ``None`` if the error should be surfaced to the user.
-            Mutates outer ``transient_retries`` via nonlocal.
+            or ``None`` if retries are exhausted or events were already
+            yielded.  Mutates outer ``transient_retries`` via nonlocal.
            """
            nonlocal transient_retries
            if events_yielded > 0:
                return None
            transient_retries += 1
-            if transient_retries > max_transient:
+            if transient_retries > max_transient_retries:
                return None
            return 2 ** (transient_retries - 1)  # 1s, 2s, 4s, ...

@@ -2171,14 +2171,14 @@ async def stream_chat_completion_sdk(
                if exc.code == "transient_api_error" or is_transient_api_error(
                    str(exc)
                ):
-                    backoff = _can_retry_transient()
+                    backoff = _next_transient_backoff()
                    if backoff is not None:
                        logger.warning(
                            "%s Transient error — retrying in %ds (%d/%d)",
                            log_prefix,
                            backoff,
                            transient_retries,
-                            max_transient,
+                            max_transient_retries,
                        )
                        yield StreamStatus(
                            message=f"Connection interrupted, retrying in {backoff}s…"
@@ -2247,14 +2247,14 @@ async def stream_chat_completion_sdk(
                # Transient API errors (ECONNRESET, 429, 5xx) — retry
                # with exponential backoff via the shared helper.
                if is_transient:
-                    backoff = _can_retry_transient()
+                    backoff = _next_transient_backoff()
                    if backoff is not None:
                        logger.warning(
                            "%s Transient exception — retrying in %ds (%d/%d)",
                            log_prefix,
                            backoff,
                            transient_retries,
-                            max_transient,
+                            max_transient_retries,
                        )
                        yield StreamStatus(
                            message=f"Connection interrupted, retrying "