fix(backend): add 429/5xx patterns to is_transient_api_error and add config validators

- Add rate-limit (429) and server error (5xx) string patterns to
  is_transient_api_error() so the fallback retry path catches these
  in addition to connection-level errors (ECONNRESET).
- Add ge/le validators on max_turns (1-500) and max_budget_usd
  (0.01-100.0) to prevent misconfiguration.
- Rename max_transient -> max_transient_retries and
  _can_retry_transient() -> _next_transient_backoff() for clarity.
- Add comprehensive tests for all new transient patterns and config
  boundary validation.
This commit is contained in:
Zamil Majdy
2026-04-02 06:21:51 +02:00
parent 32bfe1b209
commit b3d9e9e856
4 changed files with 152 additions and 11 deletions

View File

@@ -136,11 +136,15 @@ class ChatConfig(BaseSettings):
)
claude_agent_max_turns: int = Field(
default=50,
ge=1,
le=500,
description="Maximum number of agentic turns (tool-use loops) per query. "
"Prevents runaway tool loops from burning budget.",
)
claude_agent_max_budget_usd: float = Field(
default=5.0,
ge=0.01,
le=100.0,
description="Maximum spend in USD per SDK query. The CLI aborts the "
"request if this budget is exceeded.",
)

View File

@@ -44,12 +44,32 @@ def parse_node_id_from_exec_id(node_exec_id: str) -> str:
# Transient Anthropic API error detection
# ---------------------------------------------------------------------------
# Patterns in error text that indicate a transient Anthropic API error
# (ECONNRESET / dropped TCP connection) which is retryable.
# which is retryable. Covers:
# - Connection-level: ECONNRESET, dropped TCP connections
# - HTTP 429: rate-limit / too-many-requests
# - HTTP 5xx: server errors, overloaded
_TRANSIENT_ERROR_PATTERNS = (
# Connection-level
"socket connection was closed unexpectedly",
"ECONNRESET",
"connection was forcibly closed",
"network socket disconnected",
# 429 rate-limit patterns
"rate limit",
"rate_limit",
"too many requests",
"status code 429",
# 5xx server error patterns
"overloaded",
"internal server error",
"bad gateway",
"service unavailable",
"gateway timeout",
"status code 529",
"status code 500",
"status code 502",
"status code 503",
"status code 504",
)
FRIENDLY_TRANSIENT_MSG = "Anthropic connection interrupted — please retry"

View File

@@ -2,7 +2,11 @@
from unittest.mock import patch
import pytest
from pydantic import ValidationError
from backend.copilot.config import ChatConfig
from backend.copilot.constants import is_transient_api_error
def _make_config(**overrides) -> ChatConfig:
@@ -258,3 +262,116 @@ class TestBuildSdkEnv:
assert isinstance(env, dict)
env["CLAUDE_CODE_TMPDIR"] = "/tmp/test"
assert env["CLAUDE_CODE_TMPDIR"] == "/tmp/test"
# ---------------------------------------------------------------------------
# is_transient_api_error
# ---------------------------------------------------------------------------
class TestIsTransientApiError:
"""Verify that is_transient_api_error detects all transient patterns."""
@pytest.mark.parametrize(
"error_text",
[
"socket connection was closed unexpectedly",
"ECONNRESET",
"connection was forcibly closed",
"network socket disconnected",
],
)
def test_connection_level_errors(self, error_text: str):
assert is_transient_api_error(error_text)
@pytest.mark.parametrize(
"error_text",
[
"rate limit exceeded",
"rate_limit_error",
"Too Many Requests",
"status code 429",
],
)
def test_429_rate_limit_errors(self, error_text: str):
assert is_transient_api_error(error_text)
@pytest.mark.parametrize(
"error_text",
[
"API is overloaded",
"Internal Server Error",
"Bad Gateway",
"Service Unavailable",
"Gateway Timeout",
"status code 529",
"status code 500",
"status code 502",
"status code 503",
"status code 504",
],
)
def test_5xx_server_errors(self, error_text: str):
assert is_transient_api_error(error_text)
@pytest.mark.parametrize(
"error_text",
[
"invalid_api_key",
"Authentication failed",
"prompt is too long",
"model not found",
"",
],
)
def test_non_transient_errors(self, error_text: str):
assert not is_transient_api_error(error_text)
def test_case_insensitive(self):
assert is_transient_api_error("SOCKET CONNECTION WAS CLOSED UNEXPECTEDLY")
assert is_transient_api_error("econnreset")
# ---------------------------------------------------------------------------
# Config validators for max_turns / max_budget_usd
# ---------------------------------------------------------------------------
class TestConfigValidators:
"""Verify ge/le bounds on max_turns and max_budget_usd."""
def test_max_turns_rejects_zero(self):
with pytest.raises(ValidationError):
_make_config(claude_agent_max_turns=0)
def test_max_turns_rejects_negative(self):
with pytest.raises(ValidationError):
_make_config(claude_agent_max_turns=-1)
def test_max_turns_rejects_above_500(self):
with pytest.raises(ValidationError):
_make_config(claude_agent_max_turns=501)
def test_max_turns_accepts_boundary_values(self):
cfg_low = _make_config(claude_agent_max_turns=1)
assert cfg_low.claude_agent_max_turns == 1
cfg_high = _make_config(claude_agent_max_turns=500)
assert cfg_high.claude_agent_max_turns == 500
def test_max_budget_rejects_zero(self):
with pytest.raises(ValidationError):
_make_config(claude_agent_max_budget_usd=0.0)
def test_max_budget_rejects_negative(self):
with pytest.raises(ValidationError):
_make_config(claude_agent_max_budget_usd=-1.0)
def test_max_budget_rejects_above_100(self):
with pytest.raises(ValidationError):
_make_config(claude_agent_max_budget_usd=100.01)
def test_max_budget_accepts_boundary_values(self):
cfg_low = _make_config(claude_agent_max_budget_usd=0.01)
assert cfg_low.claude_agent_max_budget_usd == 0.01
cfg_high = _make_config(claude_agent_max_budget_usd=100.0)
assert cfg_high.claude_agent_max_budget_usd == 100.0

View File

@@ -2027,20 +2027,20 @@ async def stream_chat_completion_sdk(
# Transient retry helper — deduplicates the logic shared between
# _HandledStreamError and the generic except-Exception handler.
transient_retries = 0
max_transient = config.claude_agent_max_transient_retries
max_transient_retries = config.claude_agent_max_transient_retries
def _can_retry_transient() -> int | None:
"""Check if a transient retry is possible.
def _next_transient_backoff() -> int | None:
"""Return the next backoff delay in seconds, or ``None`` to surface the error.
Returns the backoff seconds if a retry should be attempted,
or ``None`` if the error should be surfaced to the user.
Mutates outer ``transient_retries`` via nonlocal.
or ``None`` if retries are exhausted or events were already
yielded. Mutates outer ``transient_retries`` via nonlocal.
"""
nonlocal transient_retries
if events_yielded > 0:
return None
transient_retries += 1
if transient_retries > max_transient:
if transient_retries > max_transient_retries:
return None
return 2 ** (transient_retries - 1) # 1s, 2s, 4s, ...
@@ -2171,14 +2171,14 @@ async def stream_chat_completion_sdk(
if exc.code == "transient_api_error" or is_transient_api_error(
str(exc)
):
backoff = _can_retry_transient()
backoff = _next_transient_backoff()
if backoff is not None:
logger.warning(
"%s Transient error — retrying in %ds (%d/%d)",
log_prefix,
backoff,
transient_retries,
max_transient,
max_transient_retries,
)
yield StreamStatus(
message=f"Connection interrupted, retrying in {backoff}s…"
@@ -2247,14 +2247,14 @@ async def stream_chat_completion_sdk(
# Transient API errors (ECONNRESET, 429, 5xx) — retry
# with exponential backoff via the shared helper.
if is_transient:
backoff = _can_retry_transient()
backoff = _next_transient_backoff()
if backoff is not None:
logger.warning(
"%s Transient exception — retrying in %ds (%d/%d)",
log_prefix,
backoff,
transient_retries,
max_transient,
max_transient_retries,
)
yield StreamStatus(
message=f"Connection interrupted, retrying "