Mirror of https://github.com/Significant-Gravitas/AutoGPT.git, synced 2026-04-08 03:00:28 -04:00.
feat(backend): implement comprehensive rate-limited Discord alerting system (#11106)
## Summary Implement comprehensive Discord alerting system with intelligent rate limiting to prevent spam and provide proper visibility into system failures across retry mechanisms and execution errors. ## Key Features ### 🚨 Rate-Limited Discord Alerting Infrastructure - **Reusable rate-limited alerts**: `send_rate_limited_discord_alert()` function for any Discord alerts - **5-minute rate limiting**: Prevents spam for identical error signatures (function+error+context) - **Thread-safe**: Proper locking for concurrent alert attempts - **Configurable channels**: Support custom Discord channels or default to PLATFORM - **Graceful failure handling**: Alert failures don't break main application flow ### 🔄 Enhanced Retry Alert System - **Unified threshold alerting**: Both general retries and infrastructure retries alert at EXCESSIVE_RETRY_THRESHOLD (50 attempts) - **Critical retry alerts**: Early warning when operations approach failure threshold - **Infrastructure monitoring**: Dedicated alerts for database, Redis, RabbitMQ connection issues - **Rate limited**: All retry alerts use rate limiting to prevent overwhelming Discord channels ### 📊 Unknown Execution Error Alerts - **Automatic error detection**: Alert for unexpected graph execution failures - **Rich context**: Include user ID, graph ID, execution ID, error type and message - **Filtered alerts**: Skip known errors (InsufficientBalanceError, ModerationError) - **Proper error tracking**: Ensure execution_stats.error is set for all error types ## Technical Implementation ### Rate Limiting Strategy ```python # Create unique signatures based on function+error+context error_signature = f"{context}:{func_name}:{type(exception).__name__}:{str(exception)[:100]}" ``` - **5-minute windows**: ALERT_RATE_LIMIT_SECONDS = 300 prevents duplicate alerts - **Memory efficient**: Only store last alert timestamp per unique error signature - **Context awareness**: Same error in different contexts can send separate alerts ### Alerting 
Hierarchy 1. **50 attempts**: Critical alert warning about approaching failure (EXCESSIVE_RETRY_THRESHOLD) 2. **100 attempts**: Final infrastructure failure (conn_retry max_retry) 3. **Unknown execution errors**: Immediate rate-limited alerts for unexpected failures ## Files Modified ### Core Implementation - `backend/executor/manager.py`: Unknown execution error alerts with rate limiting - `backend/util/retry.py`: Comprehensive rate-limited alerting infrastructure - `backend/util/retry_test.py`: Full test coverage for rate limiting functionality (14 tests) ### Code Quality Improvements - **Inlined alert messages**: Eliminated unnecessary temporary variables - **Simplified logic**: Removed excessive comments and redundant alerts - **Consistent patterns**: All alert functions follow same clean code style - **DRY principle**: Reusable rate-limited alert system for future monitoring needs ## Benefits ### 🛡️ Prevents Alert Spam - **Rate limiting**: No more overwhelming Discord channels with duplicate alerts - **Intelligent deduplication**: Same errors rate limited while different errors get through - **Thread safety**: Concurrent operations handled correctly ### 🔍 Better System Visibility - **Unknown errors**: Issues that need investigation are properly surfaced - **Infrastructure monitoring**: Early warning for database/Redis/RabbitMQ issues - **Rich context**: All necessary debugging information included in alerts ### 🧹 Maintainable Codebase - **Reusable infrastructure**: `send_rate_limited_discord_alert()` for future monitoring - **Clean, consistent code**: Inlined messages, simplified logic, proper abstractions - **Comprehensive testing**: Rate limiting edge cases and real-world scenarios covered ## Validation Results - ✅ All 14 retry tests pass including comprehensive rate limiting coverage - ✅ Manager execution tests pass validating integration with execution flow - ✅ Thread safety validated with concurrent alert attempt tests - ✅ Real-world scenarios tested 
including the specific spend_credits spam issue that motivated this work - ✅ Code formatting, linting, and type checking all pass ## Before/After Comparison ### Before - No rate limiting → Discord spam for repeated errors - Unknown execution errors not monitored → Issues went unnoticed - Inconsistent alerting thresholds → Confusing monitoring - Verbose code with temporary variables → Harder to maintain ### After - ✅ Rate-limited intelligent alerting prevents spam - ✅ Unknown execution errors properly monitored with context - ✅ Unified 50-attempt threshold for consistent monitoring - ✅ Clean, maintainable code with reusable infrastructure 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -84,7 +84,11 @@ from backend.util.file import clean_exec_files
|
||||
from backend.util.logging import TruncatedLogger, configure_logging
|
||||
from backend.util.metrics import DiscordChannel
|
||||
from backend.util.process import AppProcess, set_service_name
|
||||
from backend.util.retry import continuous_retry, func_retry
|
||||
from backend.util.retry import (
|
||||
continuous_retry,
|
||||
func_retry,
|
||||
send_rate_limited_discord_alert,
|
||||
)
|
||||
from backend.util.settings import Settings
|
||||
|
||||
from .cluster_lock import ClusterLock
|
||||
@@ -979,16 +983,31 @@ class ExecutionProcessor:
|
||||
if isinstance(e, Exception)
|
||||
else Exception(f"{e.__class__.__name__}: {e}")
|
||||
)
|
||||
if not execution_stats.error:
|
||||
execution_stats.error = str(error)
|
||||
|
||||
known_errors = (InsufficientBalanceError, ModerationError)
|
||||
if isinstance(error, known_errors):
|
||||
execution_stats.error = str(error)
|
||||
return ExecutionStatus.FAILED
|
||||
|
||||
execution_status = ExecutionStatus.FAILED
|
||||
log_metadata.exception(
|
||||
f"Failed graph execution {graph_exec.graph_exec_id}: {error}"
|
||||
)
|
||||
|
||||
# Send rate-limited Discord alert for unknown/unexpected errors
|
||||
send_rate_limited_discord_alert(
|
||||
"graph_execution",
|
||||
error,
|
||||
"unknown_error",
|
||||
f"🚨 **Unknown Graph Execution Error**\n"
|
||||
f"User: {graph_exec.user_id}\n"
|
||||
f"Graph ID: {graph_exec.graph_id}\n"
|
||||
f"Execution ID: {graph_exec.graph_exec_id}\n"
|
||||
f"Error Type: {type(error).__name__}\n"
|
||||
f"Error: {str(error)[:200]}{'...' if len(str(error)) > 200 else ''}\n",
|
||||
)
|
||||
|
||||
raise
|
||||
|
||||
finally:
|
||||
|
||||
@@ -26,7 +26,7 @@ _rate_limiter_lock = threading.Lock()
|
||||
ALERT_RATE_LIMIT_SECONDS = 300 # 5 minutes between same alerts
|
||||
|
||||
|
||||
def _should_send_alert(func_name: str, exception: Exception, context: str = "") -> bool:
|
||||
def should_send_alert(func_name: str, exception: Exception, context: str = "") -> bool:
|
||||
"""Check if we should send an alert based on rate limiting."""
|
||||
# Create a unique key for this function+error+context combination
|
||||
error_signature = (
|
||||
@@ -42,38 +42,51 @@ def _should_send_alert(func_name: str, exception: Exception, context: str = "")
|
||||
return False
|
||||
|
||||
|
||||
def send_rate_limited_discord_alert(
    func_name: str, exception: Exception, context: str, alert_msg: str, channel=None
) -> bool:
    """
    Send a Discord alert with rate limiting.

    Duplicate alerts (same function + exception type/message + context
    signature) inside the rate-limit window are suppressed.

    Returns True if alert was sent, False if rate limited.
    """
    # Bail out early when an identical alert already fired within the window.
    if not should_send_alert(func_name, exception, context):
        return False

    try:
        # Imported lazily to avoid circular imports at module load time.
        from backend.util.clients import get_notification_manager_client
        from backend.util.metrics import DiscordChannel

        client = get_notification_manager_client()
        client.discord_system_alert(alert_msg, channel or DiscordChannel.PLATFORM)
        return True
    except Exception as alert_error:
        # Alerting failures must never break the caller's main flow.
        logger.error(f"Failed to send Discord alert: {alert_error}")
        return False
|
||||
|
||||
|
||||
def _send_critical_retry_alert(
|
||||
func_name: str, attempt_number: int, exception: Exception, context: str = ""
|
||||
):
|
||||
"""Send alert when a function is approaching the retry failure threshold."""
|
||||
|
||||
# Rate limit alerts to prevent spam
|
||||
if not _should_send_alert(func_name, exception, context):
|
||||
return # Just silently skip, no extra logging
|
||||
|
||||
try:
|
||||
# Import here to avoid circular imports
|
||||
from backend.util.clients import get_notification_manager_client
|
||||
|
||||
notification_client = get_notification_manager_client()
|
||||
|
||||
prefix = f"{context}: " if context else ""
|
||||
alert_msg = (
|
||||
f"🚨 CRITICAL: Operation Approaching Failure Threshold: {prefix}'{func_name}'\n\n"
|
||||
f"Current attempt: {attempt_number}/{EXCESSIVE_RETRY_THRESHOLD}\n"
|
||||
f"Error: {type(exception).__name__}: {exception}\n\n"
|
||||
f"This operation is about to fail permanently. Investigate immediately."
|
||||
)
|
||||
|
||||
notification_client.discord_system_alert(alert_msg)
|
||||
prefix = f"{context}: " if context else ""
|
||||
if send_rate_limited_discord_alert(
|
||||
func_name,
|
||||
exception,
|
||||
context,
|
||||
f"🚨 CRITICAL: Operation Approaching Failure Threshold: {prefix}'{func_name}'\n\n"
|
||||
f"Current attempt: {attempt_number}/{EXCESSIVE_RETRY_THRESHOLD}\n"
|
||||
f"Error: {type(exception).__name__}: {exception}\n\n"
|
||||
f"This operation is about to fail permanently. Investigate immediately.",
|
||||
):
|
||||
logger.critical(
|
||||
f"CRITICAL ALERT SENT: Operation {func_name} at attempt {attempt_number}"
|
||||
)
|
||||
|
||||
except Exception as alert_error:
|
||||
logger.error(f"Failed to send critical retry alert: {alert_error}")
|
||||
# Don't let alerting failures break the main flow
|
||||
|
||||
|
||||
def _create_retry_callback(context: str = ""):
|
||||
"""Create a retry callback with optional context."""
|
||||
@@ -157,7 +170,7 @@ def _log_prefix(resource_name: str, conn_id: str):
|
||||
def conn_retry(
|
||||
resource_name: str,
|
||||
action_name: str,
|
||||
max_retry: int = 5,
|
||||
max_retry: int = 100,
|
||||
max_wait: float = 30,
|
||||
):
|
||||
conn_id = str(uuid4())
|
||||
@@ -165,10 +178,29 @@ def conn_retry(
|
||||
def on_retry(retry_state):
|
||||
prefix = _log_prefix(resource_name, conn_id)
|
||||
exception = retry_state.outcome.exception()
|
||||
attempt_number = retry_state.attempt_number
|
||||
func_name = getattr(retry_state.fn, "__name__", "unknown")
|
||||
|
||||
if retry_state.outcome.failed and retry_state.next_action is None:
|
||||
logger.error(f"{prefix} {action_name} failed after retries: {exception}")
|
||||
else:
|
||||
if attempt_number == EXCESSIVE_RETRY_THRESHOLD:
|
||||
if send_rate_limited_discord_alert(
|
||||
func_name,
|
||||
exception,
|
||||
f"{resource_name}_infrastructure",
|
||||
f"🚨 **Critical Infrastructure Connection Issue**\n"
|
||||
f"Resource: {resource_name}\n"
|
||||
f"Action: {action_name}\n"
|
||||
f"Function: {func_name}\n"
|
||||
f"Current attempt: {attempt_number}/{max_retry + 1}\n"
|
||||
f"Error: {type(exception).__name__}: {str(exception)[:200]}{'...' if len(str(exception)) > 200 else ''}\n\n"
|
||||
f"Infrastructure component is approaching failure threshold. Investigate immediately.",
|
||||
):
|
||||
logger.critical(
|
||||
f"INFRASTRUCTURE ALERT SENT: {resource_name} at {attempt_number} attempts"
|
||||
)
|
||||
|
||||
logger.warning(
|
||||
f"{prefix} {action_name} failed: {exception}. Retrying now..."
|
||||
)
|
||||
@@ -244,8 +276,8 @@ def continuous_retry(*, retry_delay: float = 1.0):
|
||||
|
||||
@wraps(func)
|
||||
async def async_wrapper(*args, **kwargs):
|
||||
counter = 0
|
||||
while True:
|
||||
counter = 0
|
||||
try:
|
||||
return await func(*args, **kwargs)
|
||||
except Exception as exc:
|
||||
|
||||
@@ -10,9 +10,9 @@ from backend.util.retry import (
|
||||
_alert_rate_limiter,
|
||||
_rate_limiter_lock,
|
||||
_send_critical_retry_alert,
|
||||
_should_send_alert,
|
||||
conn_retry,
|
||||
create_retry_decorator,
|
||||
should_send_alert,
|
||||
)
|
||||
|
||||
|
||||
@@ -71,17 +71,17 @@ class TestRetryRateLimiting:
|
||||
def test_should_send_alert_allows_first_occurrence(self):
|
||||
"""Test that the first occurrence of an error allows alert."""
|
||||
exc = ValueError("test error")
|
||||
assert _should_send_alert("test_func", exc, "test_context") is True
|
||||
assert should_send_alert("test_func", exc, "test_context") is True
|
||||
|
||||
def test_should_send_alert_rate_limits_duplicate(self):
|
||||
"""Test that duplicate errors are rate limited."""
|
||||
exc = ValueError("test error")
|
||||
|
||||
# First call should be allowed
|
||||
assert _should_send_alert("test_func", exc, "test_context") is True
|
||||
assert should_send_alert("test_func", exc, "test_context") is True
|
||||
|
||||
# Second call should be rate limited
|
||||
assert _should_send_alert("test_func", exc, "test_context") is False
|
||||
assert should_send_alert("test_func", exc, "test_context") is False
|
||||
|
||||
def test_should_send_alert_allows_different_errors(self):
|
||||
"""Test that different errors are allowed even if same function."""
|
||||
@@ -89,47 +89,47 @@ class TestRetryRateLimiting:
|
||||
exc2 = ValueError("error 2")
|
||||
|
||||
# First error should be allowed
|
||||
assert _should_send_alert("test_func", exc1, "test_context") is True
|
||||
assert should_send_alert("test_func", exc1, "test_context") is True
|
||||
|
||||
# Different error should also be allowed
|
||||
assert _should_send_alert("test_func", exc2, "test_context") is True
|
||||
assert should_send_alert("test_func", exc2, "test_context") is True
|
||||
|
||||
def test_should_send_alert_allows_different_contexts(self):
|
||||
"""Test that same error in different contexts is allowed."""
|
||||
exc = ValueError("test error")
|
||||
|
||||
# First context should be allowed
|
||||
assert _should_send_alert("test_func", exc, "context1") is True
|
||||
assert should_send_alert("test_func", exc, "context1") is True
|
||||
|
||||
# Different context should also be allowed
|
||||
assert _should_send_alert("test_func", exc, "context2") is True
|
||||
assert should_send_alert("test_func", exc, "context2") is True
|
||||
|
||||
def test_should_send_alert_allows_different_functions(self):
|
||||
"""Test that same error in different functions is allowed."""
|
||||
exc = ValueError("test error")
|
||||
|
||||
# First function should be allowed
|
||||
assert _should_send_alert("func1", exc, "test_context") is True
|
||||
assert should_send_alert("func1", exc, "test_context") is True
|
||||
|
||||
# Different function should also be allowed
|
||||
assert _should_send_alert("func2", exc, "test_context") is True
|
||||
assert should_send_alert("func2", exc, "test_context") is True
|
||||
|
||||
def test_should_send_alert_respects_time_window(self):
|
||||
"""Test that alerts are allowed again after the rate limit window."""
|
||||
exc = ValueError("test error")
|
||||
|
||||
# First call should be allowed
|
||||
assert _should_send_alert("test_func", exc, "test_context") is True
|
||||
assert should_send_alert("test_func", exc, "test_context") is True
|
||||
|
||||
# Immediately after should be rate limited
|
||||
assert _should_send_alert("test_func", exc, "test_context") is False
|
||||
assert should_send_alert("test_func", exc, "test_context") is False
|
||||
|
||||
# Mock time to simulate passage of rate limit window
|
||||
current_time = time.time()
|
||||
with patch("backend.util.retry.time.time") as mock_time:
|
||||
# Simulate time passing beyond rate limit window
|
||||
mock_time.return_value = current_time + ALERT_RATE_LIMIT_SECONDS + 1
|
||||
assert _should_send_alert("test_func", exc, "test_context") is True
|
||||
assert should_send_alert("test_func", exc, "test_context") is True
|
||||
|
||||
def test_should_send_alert_thread_safety(self):
|
||||
"""Test that rate limiting is thread-safe."""
|
||||
@@ -137,7 +137,7 @@ class TestRetryRateLimiting:
|
||||
results = []
|
||||
|
||||
def check_alert():
|
||||
result = _should_send_alert("test_func", exc, "test_context")
|
||||
result = should_send_alert("test_func", exc, "test_context")
|
||||
results.append(result)
|
||||
|
||||
# Create multiple threads trying to send the same alert
|
||||
@@ -192,7 +192,7 @@ class TestRetryRateLimiting:
|
||||
_send_critical_retry_alert("test_func", 50, exc, "test_context")
|
||||
|
||||
# Rate limiter should still work for subsequent calls
|
||||
assert _should_send_alert("test_func", exc, "test_context") is False
|
||||
assert should_send_alert("test_func", exc, "test_context") is False
|
||||
|
||||
def test_error_signature_generation(self):
|
||||
"""Test that error signatures are generated correctly for rate limiting."""
|
||||
@@ -201,8 +201,8 @@ class TestRetryRateLimiting:
|
||||
exc = ValueError(long_message)
|
||||
|
||||
# Should not raise exception and should work normally
|
||||
assert _should_send_alert("test_func", exc, "test_context") is True
|
||||
assert _should_send_alert("test_func", exc, "test_context") is False
|
||||
assert should_send_alert("test_func", exc, "test_context") is True
|
||||
assert should_send_alert("test_func", exc, "test_context") is False
|
||||
|
||||
def test_real_world_scenario_spend_credits_spam(self):
|
||||
"""Test the real-world scenario that was causing spam."""
|
||||
|
||||
Reference in New Issue
Block a user