mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
fix(platform): reduce Sentry noise by filtering expected errors and downgrading log levels
- Add `before_send` hook to Sentry to filter known expected errors: AMQP/RabbitMQ connection errors, user credential errors (invalid API keys), insufficient balance, blocked IP access, Discord token errors, Google metadata DNS errors, inactive email recipients - Downgrade connection retry failures from error to warning in conn_retry - Stop sending user-caused ValueError exceptions to Sentry in block execution - Downgrade scheduler job failures and graph validation errors to warning - Wrap discord_system_alert to catch and warn instead of raising - Downgrade all notification system errors to warning (batch processing, email sending, user lookup, preference handling) - Downgrade cloud storage cleanup errors to warning - Downgrade executor shutdown/disconnect errors to warning
This commit is contained in:
@@ -375,9 +375,12 @@ async def execute_node(
|
||||
log_metadata.debug("Node produced output", **{output_name: output_data})
|
||||
yield output_name, output_data
|
||||
except Exception as ex:
|
||||
# Capture exception WITH context still set before restoring scope
|
||||
sentry_sdk.capture_exception(error=ex, scope=scope)
|
||||
sentry_sdk.flush() # Ensure it's sent before we restore scope
|
||||
# Only capture unexpected errors to Sentry, not user-caused ones.
|
||||
# ValueError subclasses (BlockExecutionError, BlockInputError,
|
||||
# InsufficientBalanceError, etc.) are expected user/validation errors.
|
||||
if not isinstance(ex, ValueError):
|
||||
sentry_sdk.capture_exception(error=ex, scope=scope)
|
||||
sentry_sdk.flush()
|
||||
# Re-raise to maintain normal error flow
|
||||
raise
|
||||
finally:
|
||||
@@ -1478,7 +1481,7 @@ class ExecutionProcessor:
|
||||
alert_message, DiscordChannel.PRODUCT
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send low balance Discord alert: {e}")
|
||||
logger.warning(f"Failed to send low balance Discord alert: {e}")
|
||||
|
||||
|
||||
class ExecutionManager(AppProcess):
|
||||
@@ -1903,14 +1906,14 @@ class ExecutionManager(AppProcess):
|
||||
try:
|
||||
thread.join(timeout=300)
|
||||
except TimeoutError:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"{prefix} ⚠️ Run thread did not finish in time, forcing disconnect"
|
||||
)
|
||||
|
||||
client.disconnect()
|
||||
logger.info(f"{prefix} ✅ Run client disconnected")
|
||||
except Exception as e:
|
||||
logger.error(f"{prefix} ⚠️ Error disconnecting run client: {type(e)} {e}")
|
||||
logger.warning(f"{prefix} ⚠️ Error disconnecting run client: {type(e)} {e}")
|
||||
|
||||
def cleanup(self):
|
||||
"""Override cleanup to implement graceful shutdown with active execution waiting."""
|
||||
@@ -1926,7 +1929,9 @@ class ExecutionManager(AppProcess):
|
||||
)
|
||||
logger.info(f"{prefix} ✅ Exec consumer has been signaled to stop")
|
||||
except Exception as e:
|
||||
logger.error(f"{prefix} ⚠️ Error signaling consumer to stop: {type(e)} {e}")
|
||||
logger.warning(
|
||||
f"{prefix} ⚠️ Error signaling consumer to stop: {type(e)} {e}"
|
||||
)
|
||||
|
||||
# Wait for active executions to complete
|
||||
if self.active_graph_runs:
|
||||
@@ -1957,7 +1962,7 @@ class ExecutionManager(AppProcess):
|
||||
waited += wait_interval
|
||||
|
||||
if self.active_graph_runs:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"{prefix} ⚠️ {len(self.active_graph_runs)} executions still running after {max_wait}s"
|
||||
)
|
||||
else:
|
||||
@@ -1968,7 +1973,7 @@ class ExecutionManager(AppProcess):
|
||||
self.executor.shutdown(cancel_futures=True, wait=False)
|
||||
logger.info(f"{prefix} ✅ Executor shutdown completed")
|
||||
except Exception as e:
|
||||
logger.error(f"{prefix} ⚠️ Error during executor shutdown: {type(e)} {e}")
|
||||
logger.warning(f"{prefix} ⚠️ Error during executor shutdown: {type(e)} {e}")
|
||||
|
||||
# Release remaining execution locks
|
||||
try:
|
||||
|
||||
@@ -94,7 +94,7 @@ SCHEDULER_OPERATION_TIMEOUT_SECONDS = 300 # 5 minutes for scheduler operations
|
||||
def job_listener(event):
|
||||
"""Logs job execution outcomes for better monitoring."""
|
||||
if event.exception:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Job {event.job_id} failed: {type(event.exception).__name__}: {event.exception}"
|
||||
)
|
||||
else:
|
||||
@@ -137,7 +137,7 @@ def run_async(coro, timeout: float = SCHEDULER_OPERATION_TIMEOUT_SECONDS):
|
||||
try:
|
||||
return future.result(timeout=timeout)
|
||||
except Exception as e:
|
||||
logger.error(f"Async operation failed: {type(e).__name__}: {e}")
|
||||
logger.warning(f"Async operation failed: {type(e).__name__}: {e}")
|
||||
raise
|
||||
|
||||
|
||||
@@ -186,7 +186,7 @@ async def _execute_graph(**kwargs):
|
||||
|
||||
|
||||
async def _handle_graph_validation_error(args: "GraphExecutionJobArgs") -> None:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Scheduled Graph {args.graph_id} failed validation. Unscheduling graph"
|
||||
)
|
||||
if args.schedule_id:
|
||||
@@ -196,8 +196,9 @@ async def _handle_graph_validation_error(args: "GraphExecutionJobArgs") -> None:
|
||||
user_id=args.user_id,
|
||||
)
|
||||
else:
|
||||
logger.error(
|
||||
f"Unable to unschedule graph: {args.graph_id} as this is an old job with no associated schedule_id please remove manually"
|
||||
logger.warning(
|
||||
f"Unable to unschedule graph: {args.graph_id} as this is an old job "
|
||||
f"with no associated schedule_id please remove manually"
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -303,9 +303,9 @@ class NotificationManager(AppService):
|
||||
)
|
||||
|
||||
if not oldest_message:
|
||||
# this should never happen
|
||||
logger.error(
|
||||
f"Batch for user {batch.user_id} and type {notification_type} has no oldest message whichshould never happen!!!!!!!!!!!!!!!!"
|
||||
logger.warning(
|
||||
f"Batch for user {batch.user_id} and type {notification_type} "
|
||||
f"has no oldest message — batch may have been cleared concurrently"
|
||||
)
|
||||
continue
|
||||
|
||||
@@ -318,7 +318,7 @@ class NotificationManager(AppService):
|
||||
).get_user_email_by_id(batch.user_id)
|
||||
|
||||
if not recipient_email:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"User email not found for user {batch.user_id}"
|
||||
)
|
||||
continue
|
||||
@@ -344,7 +344,7 @@ class NotificationManager(AppService):
|
||||
).get_user_notification_batch(batch.user_id, notification_type)
|
||||
|
||||
if not batch_data or not batch_data.notifications:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Batch data not found for user {batch.user_id}"
|
||||
)
|
||||
# Clear the batch
|
||||
@@ -372,7 +372,7 @@ class NotificationManager(AppService):
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Error parsing notification event: {e=}, {db_event=}"
|
||||
)
|
||||
continue
|
||||
@@ -415,7 +415,10 @@ class NotificationManager(AppService):
|
||||
async def discord_system_alert(
|
||||
self, content: str, channel: DiscordChannel = DiscordChannel.PLATFORM
|
||||
):
|
||||
await discord_send_alert(content, channel)
|
||||
try:
|
||||
await discord_send_alert(content, channel)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to send Discord system alert: {e}")
|
||||
|
||||
async def _queue_scheduled_notification(self, event: SummaryParamsEventModel):
|
||||
"""Queue a scheduled notification - exposed method for other services to call"""
|
||||
@@ -516,7 +519,7 @@ class NotificationManager(AppService):
|
||||
raise ValueError("Invalid event type or params")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to gather summary data: {e}")
|
||||
logger.warning(f"Failed to gather summary data: {e}")
|
||||
# Return sensible defaults in case of error
|
||||
if event_type == NotificationType.DAILY_SUMMARY and isinstance(
|
||||
params, DailySummaryParams
|
||||
@@ -562,8 +565,9 @@ class NotificationManager(AppService):
|
||||
should_retry=False
|
||||
).get_user_notification_oldest_message_in_batch(user_id, event_type)
|
||||
if not oldest_message:
|
||||
logger.error(
|
||||
f"Batch for user {user_id} and type {event_type} has no oldest message whichshould never happen!!!!!!!!!!!!!!!!"
|
||||
logger.warning(
|
||||
f"Batch for user {user_id} and type {event_type} "
|
||||
f"has no oldest message — batch may have been cleared concurrently"
|
||||
)
|
||||
return False
|
||||
oldest_age = oldest_message.created_at
|
||||
@@ -585,7 +589,7 @@ class NotificationManager(AppService):
|
||||
get_notif_data_type(event.type)
|
||||
].model_validate_json(message)
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing message due to non matching schema {e}")
|
||||
logger.warning(f"Error parsing message due to non matching schema {e}")
|
||||
return None
|
||||
|
||||
async def _process_admin_message(self, message: str) -> bool:
|
||||
@@ -614,7 +618,7 @@ class NotificationManager(AppService):
|
||||
should_retry=False
|
||||
).get_user_email_by_id(event.user_id)
|
||||
if not recipient_email:
|
||||
logger.error(f"User email not found for user {event.user_id}")
|
||||
logger.warning(f"User email not found for user {event.user_id}")
|
||||
return False
|
||||
|
||||
should_send = await self._should_email_user_based_on_preference(
|
||||
@@ -651,7 +655,7 @@ class NotificationManager(AppService):
|
||||
should_retry=False
|
||||
).get_user_email_by_id(event.user_id)
|
||||
if not recipient_email:
|
||||
logger.error(f"User email not found for user {event.user_id}")
|
||||
logger.warning(f"User email not found for user {event.user_id}")
|
||||
return False
|
||||
|
||||
should_send = await self._should_email_user_based_on_preference(
|
||||
@@ -672,7 +676,7 @@ class NotificationManager(AppService):
|
||||
should_retry=False
|
||||
).get_user_notification_batch(event.user_id, event.type)
|
||||
if not batch or not batch.notifications:
|
||||
logger.error(f"Batch not found for user {event.user_id}")
|
||||
logger.warning(f"Batch not found for user {event.user_id}")
|
||||
return False
|
||||
unsub_link = generate_unsubscribe_link(event.user_id)
|
||||
|
||||
@@ -745,7 +749,7 @@ class NotificationManager(AppService):
|
||||
f"Removed {len(chunk_ids)} sent notifications from batch"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to remove sent notifications: {e}"
|
||||
)
|
||||
# Continue anyway - better to risk duplicates than lose emails
|
||||
@@ -770,7 +774,7 @@ class NotificationManager(AppService):
|
||||
else:
|
||||
# Message is too large even after size reduction
|
||||
if attempt_size == 1:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to send notification at index {i}: "
|
||||
f"Single notification exceeds email size limit "
|
||||
f"({len(test_message):,} chars > {MAX_EMAIL_SIZE:,} chars). "
|
||||
@@ -789,7 +793,7 @@ class NotificationManager(AppService):
|
||||
f"Removed oversized notification {chunk_ids[0]} from batch permanently"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to remove oversized notification: {e}"
|
||||
)
|
||||
|
||||
@@ -823,7 +827,7 @@ class NotificationManager(AppService):
|
||||
f"Set email verification to false for user {event.user_id}"
|
||||
)
|
||||
except Exception as deactivation_error:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to deactivate email for user {event.user_id}: "
|
||||
f"{deactivation_error}"
|
||||
)
|
||||
@@ -835,7 +839,7 @@ class NotificationManager(AppService):
|
||||
f"Disabled all notification preferences for user {event.user_id}"
|
||||
)
|
||||
except Exception as disable_error:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to disable notification preferences: {disable_error}"
|
||||
)
|
||||
|
||||
@@ -848,7 +852,7 @@ class NotificationManager(AppService):
|
||||
f"Cleared ALL notification batches for user {event.user_id}"
|
||||
)
|
||||
except Exception as remove_error:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to clear batches for inactive recipient: {remove_error}"
|
||||
)
|
||||
|
||||
@@ -859,7 +863,7 @@ class NotificationManager(AppService):
|
||||
"422" in error_message
|
||||
or "unprocessable" in error_message
|
||||
):
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to send notification at index {i}: "
|
||||
f"Malformed notification data rejected by Postmark. "
|
||||
f"Error: {e}. Removing from batch permanently."
|
||||
@@ -877,7 +881,7 @@ class NotificationManager(AppService):
|
||||
"Removed malformed notification from batch permanently"
|
||||
)
|
||||
except Exception as remove_error:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to remove malformed notification: {remove_error}"
|
||||
)
|
||||
# Check if it's a ValueError for size limit
|
||||
@@ -885,14 +889,14 @@ class NotificationManager(AppService):
|
||||
isinstance(e, ValueError)
|
||||
and "too large" in error_message
|
||||
):
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to send notification at index {i}: "
|
||||
f"Notification size exceeds email limit. "
|
||||
f"Error: {e}. Skipping this notification."
|
||||
)
|
||||
# Other API errors
|
||||
else:
|
||||
logger.error(
|
||||
logger.warning(
|
||||
f"Failed to send notification at index {i}: "
|
||||
f"Email API error ({error_type}): {e}. "
|
||||
f"Skipping this notification."
|
||||
@@ -907,7 +911,9 @@ class NotificationManager(AppService):
|
||||
|
||||
if not chunk_sent:
|
||||
# Should not reach here due to single notification handling
|
||||
logger.error(f"Failed to send notifications starting at index {i}")
|
||||
logger.warning(
|
||||
f"Failed to send notifications starting at index {i}"
|
||||
)
|
||||
failed_indices.append(i)
|
||||
i += 1
|
||||
|
||||
@@ -946,7 +952,7 @@ class NotificationManager(AppService):
|
||||
should_retry=False
|
||||
).get_user_email_by_id(event.user_id)
|
||||
if not recipient_email:
|
||||
logger.error(f"User email not found for user {event.user_id}")
|
||||
logger.warning(f"User email not found for user {event.user_id}")
|
||||
return False
|
||||
should_send = await self._should_email_user_based_on_preference(
|
||||
event.user_id, event.type
|
||||
@@ -1007,7 +1013,7 @@ class NotificationManager(AppService):
|
||||
# Let message.process() handle the rejection
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing message in {queue_name}: {e}")
|
||||
logger.warning(f"Error processing message in {queue_name}: {e}")
|
||||
# Let message.process() handle the rejection
|
||||
raise
|
||||
except asyncio.CancelledError:
|
||||
|
||||
@@ -613,5 +613,5 @@ async def cleanup_expired_files_async() -> int:
|
||||
)
|
||||
return deleted_count
|
||||
except Exception as e:
|
||||
logger.error(f"[CloudStorage] Error during cloud storage cleanup: {e}")
|
||||
logger.warning(f"[CloudStorage] Error during cloud storage cleanup: {e}")
|
||||
return 0
|
||||
|
||||
@@ -21,6 +21,73 @@ class DiscordChannel(str, Enum):
|
||||
PRODUCT = "product" # For product alerts (low balance, zero balance, etc.)
|
||||
|
||||
|
||||
def _before_send(event, hint):
|
||||
"""Filter out expected/transient errors from Sentry to reduce noise."""
|
||||
if "exc_info" in hint:
|
||||
exc_type, exc_value, _ = hint["exc_info"]
|
||||
exc_msg = str(exc_value).lower() if exc_value else ""
|
||||
|
||||
# AMQP/RabbitMQ transient connection errors — expected during deploys
|
||||
amqp_keywords = [
|
||||
"amqpconnection",
|
||||
"amqpconnector",
|
||||
"connection refused",
|
||||
"connection_forced",
|
||||
"channelinvalidstateerror",
|
||||
"no active transport",
|
||||
]
|
||||
if any(kw in exc_msg for kw in amqp_keywords):
|
||||
return None
|
||||
|
||||
# User-caused credential/auth errors — not platform bugs
|
||||
user_auth_keywords = [
|
||||
"incorrect api key",
|
||||
"invalid x-api-key",
|
||||
"missing authentication header",
|
||||
"invalid api token",
|
||||
"authentication_error",
|
||||
]
|
||||
if any(kw in exc_msg for kw in user_auth_keywords):
|
||||
return None
|
||||
|
||||
# Expected business logic — insufficient balance
|
||||
if "insufficient balance" in exc_msg:
|
||||
return None
|
||||
|
||||
# Expected security check — blocked IP access
|
||||
if "access to blocked or private ip" in exc_msg:
|
||||
return None
|
||||
|
||||
# Discord bot token misconfiguration — not a platform error
|
||||
if "improper token has been passed" in exc_msg or (
|
||||
exc_type and exc_type.__name__ == "Forbidden" and "50001" in exc_msg
|
||||
):
|
||||
return None
|
||||
|
||||
# Google metadata DNS errors — expected in non-GCP environments
|
||||
if "metadata.google.internal" in exc_msg:
|
||||
return None
|
||||
|
||||
# Inactive email recipients — expected for bounced addresses
|
||||
if "marked as inactive" in exc_msg and "inactive addresses" in exc_msg:
|
||||
return None
|
||||
|
||||
# Also filter log-based events for known noisy messages
|
||||
if event.get("logger") and event.get("message"):
|
||||
msg = event["message"].lower()
|
||||
noisy_patterns = [
|
||||
"amqpconnection",
|
||||
"connection refused",
|
||||
"connection_forced",
|
||||
"unclosed client session",
|
||||
"unclosed connector",
|
||||
]
|
||||
if any(p in msg for p in noisy_patterns):
|
||||
return None
|
||||
|
||||
return event
|
||||
|
||||
|
||||
def sentry_init():
|
||||
sentry_dsn = settings.secrets.sentry_dsn
|
||||
integrations = []
|
||||
@@ -35,6 +102,7 @@ def sentry_init():
|
||||
profiles_sample_rate=1.0,
|
||||
environment=f"app:{settings.config.app_env.value}-behave:{settings.config.behave_as.value}",
|
||||
_experiments={"enable_logs": True},
|
||||
before_send=_before_send,
|
||||
integrations=[
|
||||
AsyncioIntegration(),
|
||||
LoggingIntegration(sentry_logs_level=logging.INFO),
|
||||
|
||||
@@ -64,7 +64,7 @@ def send_rate_limited_discord_alert(
|
||||
return True
|
||||
|
||||
except Exception as alert_error:
|
||||
logger.error(f"Failed to send Discord alert: {alert_error}")
|
||||
logger.warning(f"Failed to send Discord alert: {alert_error}")
|
||||
return False
|
||||
|
||||
|
||||
@@ -182,7 +182,7 @@ def conn_retry(
|
||||
func_name = getattr(retry_state.fn, "__name__", "unknown")
|
||||
|
||||
if retry_state.outcome.failed and retry_state.next_action is None:
|
||||
logger.error(f"{prefix} {action_name} failed after retries: {exception}")
|
||||
logger.warning(f"{prefix} {action_name} failed after retries: {exception}")
|
||||
else:
|
||||
if attempt_number == EXCESSIVE_RETRY_THRESHOLD:
|
||||
if send_rate_limited_discord_alert(
|
||||
@@ -225,7 +225,7 @@ def conn_retry(
|
||||
logger.info(f"{prefix} {action_name} completed successfully.")
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"{prefix} {action_name} failed after retries: {e}")
|
||||
logger.warning(f"{prefix} {action_name} failed after retries: {e}")
|
||||
raise
|
||||
|
||||
@wraps(func)
|
||||
@@ -237,7 +237,7 @@ def conn_retry(
|
||||
logger.info(f"{prefix} {action_name} completed successfully.")
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"{prefix} {action_name} failed after retries: {e}")
|
||||
logger.warning(f"{prefix} {action_name} failed after retries: {e}")
|
||||
raise
|
||||
|
||||
return async_wrapper if is_coroutine else sync_wrapper
|
||||
|
||||
Reference in New Issue
Block a user