Compare commits


2 Commits

Author SHA1 Message Date
Nicholas Tindle
d58df37238 Merge branch 'dev' into fix/sentry-performance-integrations 2026-02-04 21:32:12 -06:00
Otto
9c41512944 feat(backend): Add Sentry FastAPI and HTTPX integrations for better performance tracing
Adds FastApiIntegration and HttpxIntegration to Sentry SDK initialization to enable:
- Detailed span tracking for FastAPI request handling
- Automatic tracing of outgoing HTTP calls (OpenAI, external APIs, etc.)

This improves visibility in Sentry Performance for debugging slow requests and identifying bottlenecks in external API calls.
2026-02-04 22:47:35 +00:00
9 changed files with 34 additions and 268 deletions
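
For context, here is a minimal sketch of what the Sentry initialization described in the commit above looks like with the two new integrations enabled. The DSN and sample rate below are placeholders (the real values come from the project's settings), and the integration list is trimmed to the pieces relevant to this change:

    import logging

    import sentry_sdk
    from sentry_sdk.integrations.asyncio import AsyncioIntegration
    from sentry_sdk.integrations.fastapi import FastApiIntegration
    from sentry_sdk.integrations.httpx import HttpxIntegration
    from sentry_sdk.integrations.logging import LoggingIntegration

    sentry_sdk.init(
        dsn="https://examplekey@o0.ingest.sentry.io/0",  # placeholder DSN
        traces_sample_rate=1.0,  # placeholder; enables performance tracing
        integrations=[
            AsyncioIntegration(),
            FastApiIntegration(),  # spans for FastAPI request handling
            HttpxIntegration(),  # spans for outgoing httpx calls (OpenAI, external APIs)
            LoggingIntegration(sentry_logs_level=logging.INFO),
        ],
    )

With these in place, a slow request shows up in Sentry Performance as a FastAPI transaction with child spans for each outgoing httpx call, which is what makes bottlenecks in external API calls visible.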

View File

@@ -1385,23 +1385,8 @@ class ExecutionProcessor:
f"[View User Details]({base_url}/admin/spending?search={user_email})"
)
# Send both Discord and AllQuiet alerts
correlation_id = f"insufficient_funds_{user_id}"
get_notification_manager_client().system_alert(
content=alert_message,
channel=DiscordChannel.PRODUCT,
correlation_id=correlation_id,
severity="critical",
status="open",
extra_attributes={
"user_id": user_id,
"user_email": user_email or "unknown",
"balance": f"${e.balance / 100:.2f}",
"attempted_cost": f"${abs(e.amount) / 100:.2f}",
"shortfall": f"${abs(shortfall) / 100:.2f}",
"agent_name": metadata.name if metadata else "Unknown",
},
get_notification_manager_client().discord_system_alert(
alert_message, DiscordChannel.PRODUCT
)
except Exception as alert_error:
logger.error(
@@ -1448,22 +1433,8 @@ class ExecutionProcessor:
f"Transaction cost: ${transaction_cost / 100:.2f}\n"
f"[View User Details]({base_url}/admin/spending?search={user_email})"
)
# Send both Discord and AllQuiet alerts
correlation_id = f"low_balance_{user_id}"
get_notification_manager_client().system_alert(
content=alert_message,
channel=DiscordChannel.PRODUCT,
correlation_id=correlation_id,
severity="warning",
status="open",
extra_attributes={
"user_id": user_id,
"user_email": user_email or "unknown",
"current_balance": f"${current_balance / 100:.2f}",
"transaction_cost": f"${transaction_cost / 100:.2f}",
"threshold": f"${LOW_BALANCE_THRESHOLD / 100:.2f}",
},
get_notification_manager_client().discord_system_alert(
alert_message, DiscordChannel.PRODUCT
)
except Exception as e:
logger.error(f"Failed to send low balance Discord alert: {e}")

View File

@@ -54,16 +54,13 @@ async def test_handle_low_balance_threshold_crossing(server: SpinTestServer):
assert isinstance(notification_call.data, LowBalanceData)
assert notification_call.data.current_balance == current_balance
# Verify system alert was sent (includes Discord and AllQuiet)
mock_client.system_alert.assert_called_once()
alert_call = mock_client.system_alert.call_args
discord_message = alert_call[1]["content"]
# Verify Discord alert was sent
mock_client.discord_system_alert.assert_called_once()
discord_message = mock_client.discord_system_alert.call_args[0][0]
assert "Low Balance Alert" in discord_message
assert "test@example.com" in discord_message
assert "$4.00" in discord_message
assert "$6.00" in discord_message
# Verify AllQuiet correlation ID is set
assert alert_call[1]["correlation_id"] == f"low_balance_{user_id}"
@pytest.mark.asyncio(loop_scope="session")
@@ -106,7 +103,7 @@ async def test_handle_low_balance_no_notification_when_not_crossing(
# Verify no notification was sent
mock_queue_notif.assert_not_called()
mock_client.system_alert.assert_not_called()
mock_client.discord_system_alert.assert_not_called()
@pytest.mark.asyncio(loop_scope="session")
@@ -149,4 +146,4 @@ async def test_handle_low_balance_no_duplicate_when_already_below(
# Verify no notification was sent (user was already below threshold)
mock_queue_notif.assert_not_called()
mock_client.system_alert.assert_not_called()
mock_client.discord_system_alert.assert_not_called()

View File

@@ -12,7 +12,7 @@ from backend.util.clients import (
get_database_manager_client,
get_notification_manager_client,
)
from backend.util.metrics import DiscordChannel, sentry_capture_error
from backend.util.metrics import sentry_capture_error
from backend.util.settings import Config
logger = logging.getLogger(__name__)
@@ -75,32 +75,7 @@ class BlockErrorMonitor:
if critical_alerts:
msg = "Block Error Rate Alert:\n\n" + "\n\n".join(critical_alerts)
# Send alert with correlation ID for block errors
# We'll create a simple hash of the block IDs that have errors
blocks_with_errors = [
stats.block_id
for name, stats in block_stats.items()
if stats.total_executions >= 10
and stats.error_rate >= threshold * 100
]
correlation_id = (
f"block_errors_{len(blocks_with_errors)}_blocks_{end_time.date()}"
)
self.notification_client.system_alert(
content=msg,
channel=DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity="warning",
status="open",
extra_attributes={
"blocks_affected": str(len(critical_alerts)),
"date": end_time.date().isoformat(),
"threshold": f"{threshold * 100}%",
},
)
self.notification_client.discord_system_alert(msg)
logger.info(
f"Sent block error rate alert for {len(critical_alerts)} blocks"
)
@@ -112,22 +87,7 @@ class BlockErrorMonitor:
block_stats, start_time, end_time
)
if top_blocks_msg:
# Daily summary gets a date-based correlation ID
correlation_id = f"block_error_daily_summary_{end_time.date()}"
self.notification_client.system_alert(
content=top_blocks_msg,
channel=DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity="minor",
status="open",
extra_attributes={
"type": "daily_summary",
"date": end_time.date().isoformat(),
"top_blocks_count": str(self.include_top_blocks),
},
)
self.notification_client.discord_system_alert(top_blocks_msg)
logger.info("Sent top blocks summary")
return "Sent top blocks summary"
@@ -140,22 +100,7 @@ class BlockErrorMonitor:
error = Exception(f"Error checking block error rates: {e}")
msg = str(error)
sentry_capture_error(error)
# Send error alert with generic correlation ID
correlation_id = "block_error_monitoring_failure"
self.notification_client.system_alert(
content=msg,
channel=DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity="critical",
status="open",
extra_attributes={
"error_type": type(e).__name__,
"error_message": str(e)[:200],
},
)
self.notification_client.discord_system_alert(msg)
return msg
def _get_block_stats_from_db(

View File

@@ -8,7 +8,7 @@ from backend.util.clients import (
get_database_manager_client,
get_notification_manager_client,
)
from backend.util.metrics import DiscordChannel, sentry_capture_error
from backend.util.metrics import sentry_capture_error
from backend.util.settings import Config
logger = logging.getLogger(__name__)
@@ -102,28 +102,7 @@ class LateExecutionMonitor:
msg = str(error)
sentry_capture_error(error)
# Generate correlation ID based on the threshold and number of late executions
correlation_id = f"late_execution_{self.config.execution_late_notification_threshold_secs}s_{num_total_late}_execs_{num_users}_users"
# Send both Discord and AllQuiet alerts
self.notification_client.system_alert(
content=msg,
channel=DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity="critical",
status="open",
extra_attributes={
"total_late_executions": str(num_total_late),
"queued_executions": str(num_queued),
"running_executions": str(num_running),
"affected_users": str(num_users),
"threshold_seconds": str(
self.config.execution_late_notification_threshold_secs
),
},
)
self.notification_client.discord_system_alert(msg)
return msg

View File

@@ -1,7 +1,7 @@
import asyncio
import logging
from datetime import datetime, timedelta, timezone
from typing import Awaitable, Callable, Literal
from typing import Awaitable, Callable
import aio_pika
from prisma.enums import NotificationType
@@ -33,12 +33,7 @@ from backend.data.user import (
from backend.notifications.email import EmailSender
from backend.util.clients import get_database_manager_async_client
from backend.util.logging import TruncatedLogger
from backend.util.metrics import (
AllQuietAlert,
DiscordChannel,
discord_send_alert,
send_allquiet_alert,
)
from backend.util.metrics import DiscordChannel, discord_send_alert
from backend.util.retry import continuous_retry
from backend.util.service import (
AppService,
@@ -422,54 +417,6 @@ class NotificationManager(AppService):
):
await discord_send_alert(content, channel)
@expose
async def allquiet_system_alert(self, alert: AllQuietAlert):
await send_allquiet_alert(alert)
@expose
async def system_alert(
self,
content: str,
channel: DiscordChannel = DiscordChannel.PLATFORM,
correlation_id: str | None = None,
severity: Literal["warning", "critical", "minor"] = "warning",
status: Literal["resolved", "open"] = "open",
extra_attributes: dict[str, str] | None = None,
):
"""Send both Discord and AllQuiet alerts for system events."""
# Send Discord alert
await discord_send_alert(content, channel)
# Send AllQuiet alert if correlation_id is provided
if correlation_id:
# Extract title from content (first line or first sentence)
lines = content.split("\n")
title = lines[0] if lines else content[:100]
# Remove Discord formatting from title
title = (
title.replace("**", "")
.replace("🚨", "")
.replace("⚠️", "")
.replace("", "")
.replace("", "")
.replace("📊", "")
.strip()
)
alert = AllQuietAlert(
severity=severity,
status=status,
title=title[:100], # Limit title length
description=content,
correlation_id=correlation_id,
channel=channel.value, # Pass channel as first-class field
extra_attributes=extra_attributes or {},
)
try:
await send_allquiet_alert(alert)
except Exception as e:
logger.error(f"Failed to send AllQuiet alert: {e}")
async def _queue_scheduled_notification(self, event: SummaryParamsEventModel):
"""Queue a scheduled notification - exposed method for other services to call"""
try:
@@ -1159,5 +1106,3 @@ class NotificationManagerClient(AppServiceClient):
)
queue_weekly_summary = endpoint_to_sync(NotificationManager.queue_weekly_summary)
discord_system_alert = endpoint_to_sync(NotificationManager.discord_system_alert)
allquiet_system_alert = endpoint_to_sync(NotificationManager.allquiet_system_alert)
system_alert = endpoint_to_sync(NotificationManager.system_alert)

View File

@@ -1,12 +1,13 @@
import logging
from enum import Enum
from typing import Literal
import sentry_sdk
from pydantic import BaseModel, Field, SecretStr
from pydantic import SecretStr
from sentry_sdk.integrations import DidNotEnable
from sentry_sdk.integrations.anthropic import AnthropicIntegration
from sentry_sdk.integrations.asyncio import AsyncioIntegration
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.httpx import HttpxIntegration
from sentry_sdk.integrations.launchdarkly import LaunchDarklyIntegration
from sentry_sdk.integrations.logging import LoggingIntegration
@@ -38,6 +39,8 @@ def sentry_init():
_experiments={"enable_logs": True},
integrations=[
AsyncioIntegration(),
FastApiIntegration(), # Traces FastAPI requests with detailed spans
HttpxIntegration(), # Traces outgoing HTTP calls (OpenAI, external APIs)
LoggingIntegration(sentry_logs_level=logging.INFO),
AnthropicIntegration(
include_prompts=False,
@@ -52,31 +55,6 @@ def sentry_capture_error(error: BaseException):
sentry_sdk.flush()
class AllQuietAlert(BaseModel):
severity: Literal["warning"] | Literal["critical"] | Literal["minor"]
status: Literal["resolved"] | Literal["open"]
title: str | None = None
description: str | None = None
correlation_id: str | None = None
channel: str | None = None # Discord channel (platform or product)
extra_attributes: dict[str, str] = Field(default_factory=dict)
environment: str = (
f"app:{settings.config.app_env.value}-behave:{settings.config.behave_as.value}"
)
async def send_allquiet_alert(alert: AllQuietAlert):
hook_url = settings.secrets.allquiet_webhook_url
if not hook_url:
logging.warning("AllQuiet webhook URL not configured")
return
from backend.util.request import Requests
await Requests().post(hook_url, json=alert.model_dump())
async def discord_send_alert(
content: str, channel: DiscordChannel = DiscordChannel.PLATFORM
):

View File

@@ -4,7 +4,6 @@ import os
import threading
import time
from functools import wraps
from typing import Literal
from uuid import uuid4
from tenacity import (
@@ -44,14 +43,7 @@ def should_send_alert(func_name: str, exception: Exception, context: str = "") -
def send_rate_limited_discord_alert(
func_name: str,
exception: Exception,
context: str,
alert_msg: str,
channel=None,
correlation_id: str | None = None,
severity: Literal["warning", "critical", "minor"] = "critical",
extra_attributes: dict | None = None,
func_name: str, exception: Exception, context: str, alert_msg: str, channel=None
) -> bool:
"""
Send a Discord alert with rate limiting.
@@ -66,13 +58,8 @@ def send_rate_limited_discord_alert(
from backend.util.metrics import DiscordChannel
notification_client = get_notification_manager_client()
notification_client.system_alert(
content=alert_msg,
channel=channel or DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity=severity,
status="open",
extra_attributes=extra_attributes or {},
notification_client.discord_system_alert(
alert_msg, channel or DiscordChannel.PLATFORM
)
return True
@@ -87,30 +74,14 @@ def _send_critical_retry_alert(
"""Send alert when a function is approaching the retry failure threshold."""
prefix = f"{context}: " if context else ""
error_type = type(exception).__name__
# Create correlation ID based on context, function name, and error type
correlation_id = f"retry_failure_{context}_{func_name}_{error_type}".replace(
" ", "_"
).replace(":", "")
if send_rate_limited_discord_alert(
func_name,
exception,
context,
f"🚨 CRITICAL: Operation Approaching Failure Threshold: {prefix}'{func_name}'\n\n"
f"Current attempt: {attempt_number}/{EXCESSIVE_RETRY_THRESHOLD}\n"
f"Error: {error_type}: {exception}\n\n"
f"Error: {type(exception).__name__}: {exception}\n\n"
f"This operation is about to fail permanently. Investigate immediately.",
correlation_id=correlation_id,
severity="critical",
extra_attributes={
"function_name": func_name,
"attempt_number": str(attempt_number),
"max_attempts": str(EXCESSIVE_RETRY_THRESHOLD),
"error_type": error_type,
"context": context or "none",
},
):
logger.critical(
f"CRITICAL ALERT SENT: Operation {func_name} at attempt {attempt_number}"
@@ -214,14 +185,6 @@ def conn_retry(
logger.error(f"{prefix} {action_name} failed after retries: {exception}")
else:
if attempt_number == EXCESSIVE_RETRY_THRESHOLD:
error_type = type(exception).__name__
# Create correlation ID for infrastructure issues
correlation_id = (
f"infrastructure_{resource_name}_{action_name}_{func_name}".replace(
" ", "_"
)
)
if send_rate_limited_discord_alert(
func_name,
exception,
@@ -231,18 +194,8 @@ def conn_retry(
f"Action: {action_name}\n"
f"Function: {func_name}\n"
f"Current attempt: {attempt_number}/{max_retry + 1}\n"
f"Error: {error_type}: {str(exception)[:200]}{'...' if len(str(exception)) > 200 else ''}\n\n"
f"Error: {type(exception).__name__}: {str(exception)[:200]}{'...' if len(str(exception)) > 200 else ''}\n\n"
f"Infrastructure component is approaching failure threshold. Investigate immediately.",
correlation_id=correlation_id,
severity="critical",
extra_attributes={
"resource_name": resource_name,
"action_name": action_name,
"function_name": func_name,
"attempt_number": str(attempt_number),
"max_attempts": str(max_retry + 1),
"error_type": error_type,
},
):
logger.critical(
f"INFRASTRUCTURE ALERT SENT: {resource_name} at {attempt_number} attempts"

View File

@@ -166,16 +166,16 @@ class TestRetryRateLimiting:
# First alert should be sent
_send_critical_retry_alert("spend_credits", 50, exc, "Service communication")
assert mock_client.system_alert.call_count == 1
assert mock_client.discord_system_alert.call_count == 1
# Second identical alert should be rate limited (not sent)
_send_critical_retry_alert("spend_credits", 50, exc, "Service communication")
assert mock_client.system_alert.call_count == 1 # Still 1, not 2
assert mock_client.discord_system_alert.call_count == 1 # Still 1, not 2
# Different error should be allowed
exc2 = ValueError("different API error")
_send_critical_retry_alert("spend_credits", 50, exc2, "Service communication")
assert mock_client.system_alert.call_count == 2
assert mock_client.discord_system_alert.call_count == 2
@patch("backend.util.clients.get_notification_manager_client")
def test_send_critical_retry_alert_handles_notification_failure(
@@ -183,7 +183,7 @@ class TestRetryRateLimiting:
):
"""Test that notification failures don't break the rate limiter."""
mock_client = Mock()
mock_client.system_alert.side_effect = Exception("Notification failed")
mock_client.discord_system_alert.side_effect = Exception("Notification failed")
mock_get_client.return_value = mock_client
exc = ValueError("test error")
@@ -221,7 +221,7 @@ class TestRetryRateLimiting:
_send_critical_retry_alert(
"_call_method_sync", 50, exc, "Service communication"
)
assert mock_client.system_alert.call_count == 1
assert mock_client.discord_system_alert.call_count == 1
# Next 950 failures should not send alerts (rate limited)
for _ in range(950):
@@ -230,7 +230,7 @@ class TestRetryRateLimiting:
)
# Still only 1 alert sent total
assert mock_client.system_alert.call_count == 1
assert mock_client.discord_system_alert.call_count == 1
@patch("backend.util.clients.get_notification_manager_client")
def test_retry_decorator_with_excessive_failures(self, mock_get_client):
@@ -248,4 +248,4 @@ class TestRetryRateLimiting:
always_failing_function()
# Should have sent exactly one alert at the threshold
assert mock_client.system_alert.call_count == 1
assert mock_client.discord_system_alert.call_count == 1

View File

@@ -682,8 +682,6 @@ class Secrets(UpdateTrackingModel["Secrets"], BaseSettings):
description="The LaunchDarkly SDK key for feature flag management",
)
allquiet_webhook_url: str = Field(default="", description="AllQuiet webhook URL")
ayrshare_api_key: str = Field(default="", description="Ayrshare API Key")
ayrshare_jwt_key: str = Field(default="", description="Ayrshare private Key")