Compare commits


2 Commits

Author SHA1 Message Date
Nicholas Tindle
d58df37238 Merge branch 'dev' into fix/sentry-performance-integrations 2026-02-04 21:32:12 -06:00
Otto
9c41512944 feat(backend): Add Sentry FastAPI and HTTPX integrations for better performance tracing
Adds FastApiIntegration and HttpxIntegration to Sentry SDK initialization to enable:
- Detailed span tracking for FastAPI request handling
- Automatic tracing of outgoing HTTP calls (OpenAI, external APIs, etc.)

This improves visibility in Sentry Performance for debugging slow requests and identifying bottlenecks in external API calls.
2026-02-04 22:47:35 +00:00
9 changed files with 34 additions and 268 deletions
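
For context, here is a minimal sketch of what the Sentry initialization described in the commit above looks like with the two new integrations enabled. The DSN and sample rate below are placeholders (the real values come from the project's settings), and the integration list is trimmed to the pieces relevant to this change:

    import logging

    import sentry_sdk
    from sentry_sdk.integrations.asyncio import AsyncioIntegration
    from sentry_sdk.integrations.fastapi import FastApiIntegration
    from sentry_sdk.integrations.httpx import HttpxIntegration
    from sentry_sdk.integrations.logging import LoggingIntegration

    sentry_sdk.init(
        dsn="https://examplekey@o0.ingest.sentry.io/0",  # placeholder DSN
        traces_sample_rate=1.0,  # placeholder; enables performance tracing
        integrations=[
            AsyncioIntegration(),
            FastApiIntegration(),  # spans for FastAPI request handling
            HttpxIntegration(),  # spans for outgoing httpx calls (OpenAI, external APIs)
            LoggingIntegration(sentry_logs_level=logging.INFO),
        ],
    )

With these in place, a slow request shows up in Sentry Performance as a FastAPI transaction with child spans for each outgoing httpx call, which is what makes bottlenecks in external API calls visible.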

View File

@@ -1385,23 +1385,8 @@ class ExecutionProcessor:
f"[View User Details]({base_url}/admin/spending?search={user_email})"
)
# Send both Discord and AllQuiet alerts
correlation_id = f"insufficient_funds_{user_id}"
get_notification_manager_client().system_alert(
content=alert_message,
channel=DiscordChannel.PRODUCT,
correlation_id=correlation_id,
severity="critical",
status="open",
extra_attributes={
"user_id": user_id,
"user_email": user_email or "unknown",
"balance": f"${e.balance / 100:.2f}",
"attempted_cost": f"${abs(e.amount) / 100:.2f}",
"shortfall": f"${abs(shortfall) / 100:.2f}",
"agent_name": metadata.name if metadata else "Unknown",
},
get_notification_manager_client().discord_system_alert(
alert_message, DiscordChannel.PRODUCT
)
except Exception as alert_error:
logger.error(
@@ -1448,22 +1433,8 @@ class ExecutionProcessor:
f"Transaction cost: ${transaction_cost / 100:.2f}\n"
f"[View User Details]({base_url}/admin/spending?search={user_email})"
)
# Send both Discord and AllQuiet alerts
correlation_id = f"low_balance_{user_id}"
get_notification_manager_client().system_alert(
content=alert_message,
channel=DiscordChannel.PRODUCT,
correlation_id=correlation_id,
severity="warning",
status="open",
extra_attributes={
"user_id": user_id,
"user_email": user_email or "unknown",
"current_balance": f"${current_balance / 100:.2f}",
"transaction_cost": f"${transaction_cost / 100:.2f}",
"threshold": f"${LOW_BALANCE_THRESHOLD / 100:.2f}",
},
get_notification_manager_client().discord_system_alert(
alert_message, DiscordChannel.PRODUCT
)
except Exception as e:
logger.error(f"Failed to send low balance Discord alert: {e}")

View File

@@ -54,16 +54,13 @@ async def test_handle_low_balance_threshold_crossing(server: SpinTestServer):
assert isinstance(notification_call.data, LowBalanceData)
assert notification_call.data.current_balance == current_balance
# Verify system alert was sent (includes Discord and AllQuiet)
mock_client.system_alert.assert_called_once()
alert_call = mock_client.system_alert.call_args
discord_message = alert_call[1]["content"]
# Verify Discord alert was sent
mock_client.discord_system_alert.assert_called_once()
discord_message = mock_client.discord_system_alert.call_args[0][0]
assert "Low Balance Alert" in discord_message
assert "test@example.com" in discord_message
assert "$4.00" in discord_message
assert "$6.00" in discord_message
# Verify AllQuiet correlation ID is set
assert alert_call[1]["correlation_id"] == f"low_balance_{user_id}"
@pytest.mark.asyncio(loop_scope="session")
@@ -106,7 +103,7 @@ async def test_handle_low_balance_no_notification_when_not_crossing(
# Verify no notification was sent
mock_queue_notif.assert_not_called()
mock_client.system_alert.assert_not_called()
mock_client.discord_system_alert.assert_not_called()
@pytest.mark.asyncio(loop_scope="session")
@@ -149,4 +146,4 @@ async def test_handle_low_balance_no_duplicate_when_already_below(
# Verify no notification was sent (user was already below threshold)
mock_queue_notif.assert_not_called()
mock_client.system_alert.assert_not_called()
mock_client.discord_system_alert.assert_not_called()

View File

@@ -12,7 +12,7 @@ from backend.util.clients import (
get_database_manager_client,
get_notification_manager_client,
)
from backend.util.metrics import DiscordChannel, sentry_capture_error
from backend.util.metrics import sentry_capture_error
from backend.util.settings import Config
logger = logging.getLogger(__name__)
@@ -75,32 +75,7 @@ class BlockErrorMonitor:
if critical_alerts:
msg = "Block Error Rate Alert:\n\n" + "\n\n".join(critical_alerts)
# Send alert with correlation ID for block errors
# We'll create a simple hash of the block IDs that have errors
blocks_with_errors = [
stats.block_id
for name, stats in block_stats.items()
if stats.total_executions >= 10
and stats.error_rate >= threshold * 100
]
correlation_id = (
f"block_errors_{len(blocks_with_errors)}_blocks_{end_time.date()}"
)
self.notification_client.system_alert(
content=msg,
channel=DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity="warning",
status="open",
extra_attributes={
"blocks_affected": str(len(critical_alerts)),
"date": end_time.date().isoformat(),
"threshold": f"{threshold * 100}%",
},
)
self.notification_client.discord_system_alert(msg)
logger.info(
f"Sent block error rate alert for {len(critical_alerts)} blocks"
)
@@ -112,22 +87,7 @@ class BlockErrorMonitor:
block_stats, start_time, end_time
)
if top_blocks_msg:
# Daily summary gets a date-based correlation ID
correlation_id = f"block_error_daily_summary_{end_time.date()}"
self.notification_client.system_alert(
content=top_blocks_msg,
channel=DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity="minor",
status="open",
extra_attributes={
"type": "daily_summary",
"date": end_time.date().isoformat(),
"top_blocks_count": str(self.include_top_blocks),
},
)
self.notification_client.discord_system_alert(top_blocks_msg)
logger.info("Sent top blocks summary")
return "Sent top blocks summary"
@@ -140,22 +100,7 @@ class BlockErrorMonitor:
error = Exception(f"Error checking block error rates: {e}")
msg = str(error)
sentry_capture_error(error)
# Send error alert with generic correlation ID
correlation_id = "block_error_monitoring_failure"
self.notification_client.system_alert(
content=msg,
channel=DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity="critical",
status="open",
extra_attributes={
"error_type": type(e).__name__,
"error_message": str(e)[:200],
},
)
self.notification_client.discord_system_alert(msg)
return msg
def _get_block_stats_from_db(

View File

@@ -8,7 +8,7 @@ from backend.util.clients import (
get_database_manager_client,
get_notification_manager_client,
)
from backend.util.metrics import DiscordChannel, sentry_capture_error
from backend.util.metrics import sentry_capture_error
from backend.util.settings import Config
logger = logging.getLogger(__name__)
@@ -102,28 +102,7 @@ class LateExecutionMonitor:
msg = str(error)
sentry_capture_error(error)
# Generate correlation ID based on the threshold and number of late executions
correlation_id = f"late_execution_{self.config.execution_late_notification_threshold_secs}s_{num_total_late}_execs_{num_users}_users"
# Send both Discord and AllQuiet alerts
self.notification_client.system_alert(
content=msg,
channel=DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity="critical",
status="open",
extra_attributes={
"total_late_executions": str(num_total_late),
"queued_executions": str(num_queued),
"running_executions": str(num_running),
"affected_users": str(num_users),
"threshold_seconds": str(
self.config.execution_late_notification_threshold_secs
),
},
)
self.notification_client.discord_system_alert(msg)
return msg

View File

@@ -1,7 +1,7 @@
import asyncio
import logging
from datetime import datetime, timedelta, timezone
from typing import Awaitable, Callable, Literal
from typing import Awaitable, Callable
import aio_pika
from prisma.enums import NotificationType
@@ -33,12 +33,7 @@ from backend.data.user import (
from backend.notifications.email import EmailSender
from backend.util.clients import get_database_manager_async_client
from backend.util.logging import TruncatedLogger
from backend.util.metrics import (
AllQuietAlert,
DiscordChannel,
discord_send_alert,
send_allquiet_alert,
)
from backend.util.metrics import DiscordChannel, discord_send_alert
from backend.util.retry import continuous_retry
from backend.util.service import (
AppService,
@@ -422,54 +417,6 @@ class NotificationManager(AppService):
):
await discord_send_alert(content, channel)
@expose
async def allquiet_system_alert(self, alert: AllQuietAlert):
await send_allquiet_alert(alert)
@expose
async def system_alert(
self,
content: str,
channel: DiscordChannel = DiscordChannel.PLATFORM,
correlation_id: str | None = None,
severity: Literal["warning", "critical", "minor"] = "warning",
status: Literal["resolved", "open"] = "open",
extra_attributes: dict[str, str] | None = None,
):
"""Send both Discord and AllQuiet alerts for system events."""
# Send Discord alert
await discord_send_alert(content, channel)
# Send AllQuiet alert if correlation_id is provided
if correlation_id:
# Extract title from content (first line or first sentence)
lines = content.split("\n")
title = lines[0] if lines else content[:100]
# Remove Discord formatting from title
title = (
title.replace("**", "")
.replace("🚨", "")
.replace("⚠️", "")
.replace("", "")
.replace("", "")
.replace("📊", "")
.strip()
)
alert = AllQuietAlert(
severity=severity,
status=status,
title=title[:100], # Limit title length
description=content,
correlation_id=correlation_id,
channel=channel.value, # Pass channel as first-class field
extra_attributes=extra_attributes or {},
)
try:
await send_allquiet_alert(alert)
except Exception as e:
logger.error(f"Failed to send AllQuiet alert: {e}")
async def _queue_scheduled_notification(self, event: SummaryParamsEventModel):
"""Queue a scheduled notification - exposed method for other services to call"""
try:
@@ -1159,5 +1106,3 @@ class NotificationManagerClient(AppServiceClient):
)
queue_weekly_summary = endpoint_to_sync(NotificationManager.queue_weekly_summary)
discord_system_alert = endpoint_to_sync(NotificationManager.discord_system_alert)
allquiet_system_alert = endpoint_to_sync(NotificationManager.allquiet_system_alert)
system_alert = endpoint_to_sync(NotificationManager.system_alert)

View File

@@ -1,12 +1,13 @@
import logging
from enum import Enum
from typing import Literal
import sentry_sdk
from pydantic import BaseModel, Field, SecretStr
from pydantic import SecretStr
from sentry_sdk.integrations import DidNotEnable
from sentry_sdk.integrations.anthropic import AnthropicIntegration
from sentry_sdk.integrations.asyncio import AsyncioIntegration
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.httpx import HttpxIntegration
from sentry_sdk.integrations.launchdarkly import LaunchDarklyIntegration
from sentry_sdk.integrations.logging import LoggingIntegration
@@ -38,6 +39,8 @@ def sentry_init():
_experiments={"enable_logs": True},
integrations=[
AsyncioIntegration(),
FastApiIntegration(), # Traces FastAPI requests with detailed spans
HttpxIntegration(), # Traces outgoing HTTP calls (OpenAI, external APIs)
LoggingIntegration(sentry_logs_level=logging.INFO),
AnthropicIntegration(
include_prompts=False,
@@ -52,31 +55,6 @@ def sentry_capture_error(error: BaseException):
sentry_sdk.flush()
class AllQuietAlert(BaseModel):
severity: Literal["warning"] | Literal["critical"] | Literal["minor"]
status: Literal["resolved"] | Literal["open"]
title: str | None = None
description: str | None = None
correlation_id: str | None = None
channel: str | None = None # Discord channel (platform or product)
extra_attributes: dict[str, str] = Field(default_factory=dict)
environment: str = (
f"app:{settings.config.app_env.value}-behave:{settings.config.behave_as.value}"
)
async def send_allquiet_alert(alert: AllQuietAlert):
hook_url = settings.secrets.allquiet_webhook_url
if not hook_url:
logging.warning("AllQuiet webhook URL not configured")
return
from backend.util.request import Requests
await Requests().post(hook_url, json=alert.model_dump())
async def discord_send_alert(
content: str, channel: DiscordChannel = DiscordChannel.PLATFORM
):

View File

@@ -4,7 +4,6 @@ import os
import threading
import time
from functools import wraps
from typing import Literal
from uuid import uuid4
from tenacity import (
@@ -44,14 +43,7 @@ def should_send_alert(func_name: str, exception: Exception, context: str = "") -
def send_rate_limited_discord_alert(
func_name: str,
exception: Exception,
context: str,
alert_msg: str,
channel=None,
correlation_id: str | None = None,
severity: Literal["warning", "critical", "minor"] = "critical",
extra_attributes: dict | None = None,
func_name: str, exception: Exception, context: str, alert_msg: str, channel=None
) -> bool:
"""
Send a Discord alert with rate limiting.
@@ -66,13 +58,8 @@ def send_rate_limited_discord_alert(
from backend.util.metrics import DiscordChannel
notification_client = get_notification_manager_client()
notification_client.system_alert(
content=alert_msg,
channel=channel or DiscordChannel.PLATFORM,
correlation_id=correlation_id,
severity=severity,
status="open",
extra_attributes=extra_attributes or {},
notification_client.discord_system_alert(
alert_msg, channel or DiscordChannel.PLATFORM
)
return True
@@ -87,30 +74,14 @@ def _send_critical_retry_alert(
"""Send alert when a function is approaching the retry failure threshold."""
prefix = f"{context}: " if context else ""
error_type = type(exception).__name__
# Create correlation ID based on context, function name, and error type
correlation_id = f"retry_failure_{context}_{func_name}_{error_type}".replace(
" ", "_"
).replace(":", "")
if send_rate_limited_discord_alert(
func_name,
exception,
context,
f"🚨 CRITICAL: Operation Approaching Failure Threshold: {prefix}'{func_name}'\n\n"
f"Current attempt: {attempt_number}/{EXCESSIVE_RETRY_THRESHOLD}\n"
f"Error: {error_type}: {exception}\n\n"
f"Error: {type(exception).__name__}: {exception}\n\n"
f"This operation is about to fail permanently. Investigate immediately.",
correlation_id=correlation_id,
severity="critical",
extra_attributes={
"function_name": func_name,
"attempt_number": str(attempt_number),
"max_attempts": str(EXCESSIVE_RETRY_THRESHOLD),
"error_type": error_type,
"context": context or "none",
},
):
logger.critical(
f"CRITICAL ALERT SENT: Operation {func_name} at attempt {attempt_number}"
@@ -214,14 +185,6 @@ def conn_retry(
logger.error(f"{prefix} {action_name} failed after retries: {exception}")
else:
if attempt_number == EXCESSIVE_RETRY_THRESHOLD:
error_type = type(exception).__name__
# Create correlation ID for infrastructure issues
correlation_id = (
f"infrastructure_{resource_name}_{action_name}_{func_name}".replace(
" ", "_"
)
)
if send_rate_limited_discord_alert(
func_name,
exception,
@@ -231,18 +194,8 @@ def conn_retry(
f"Action: {action_name}\n"
f"Function: {func_name}\n"
f"Current attempt: {attempt_number}/{max_retry + 1}\n"
f"Error: {error_type}: {str(exception)[:200]}{'...' if len(str(exception)) > 200 else ''}\n\n"
f"Error: {type(exception).__name__}: {str(exception)[:200]}{'...' if len(str(exception)) > 200 else ''}\n\n"
f"Infrastructure component is approaching failure threshold. Investigate immediately.",
correlation_id=correlation_id,
severity="critical",
extra_attributes={
"resource_name": resource_name,
"action_name": action_name,
"function_name": func_name,
"attempt_number": str(attempt_number),
"max_attempts": str(max_retry + 1),
"error_type": error_type,
},
):
logger.critical(
f"INFRASTRUCTURE ALERT SENT: {resource_name} at {attempt_number} attempts"

View File

@@ -166,16 +166,16 @@ class TestRetryRateLimiting:
# First alert should be sent
_send_critical_retry_alert("spend_credits", 50, exc, "Service communication")
assert mock_client.system_alert.call_count == 1
assert mock_client.discord_system_alert.call_count == 1
# Second identical alert should be rate limited (not sent)
_send_critical_retry_alert("spend_credits", 50, exc, "Service communication")
assert mock_client.system_alert.call_count == 1 # Still 1, not 2
assert mock_client.discord_system_alert.call_count == 1 # Still 1, not 2
# Different error should be allowed
exc2 = ValueError("different API error")
_send_critical_retry_alert("spend_credits", 50, exc2, "Service communication")
assert mock_client.system_alert.call_count == 2
assert mock_client.discord_system_alert.call_count == 2
@patch("backend.util.clients.get_notification_manager_client")
def test_send_critical_retry_alert_handles_notification_failure(
@@ -183,7 +183,7 @@ class TestRetryRateLimiting:
):
"""Test that notification failures don't break the rate limiter."""
mock_client = Mock()
mock_client.system_alert.side_effect = Exception("Notification failed")
mock_client.discord_system_alert.side_effect = Exception("Notification failed")
mock_get_client.return_value = mock_client
exc = ValueError("test error")
@@ -221,7 +221,7 @@ class TestRetryRateLimiting:
_send_critical_retry_alert(
"_call_method_sync", 50, exc, "Service communication"
)
assert mock_client.system_alert.call_count == 1
assert mock_client.discord_system_alert.call_count == 1
# Next 950 failures should not send alerts (rate limited)
for _ in range(950):
@@ -230,7 +230,7 @@ class TestRetryRateLimiting:
)
# Still only 1 alert sent total
assert mock_client.system_alert.call_count == 1
assert mock_client.discord_system_alert.call_count == 1
@patch("backend.util.clients.get_notification_manager_client")
def test_retry_decorator_with_excessive_failures(self, mock_get_client):
@@ -248,4 +248,4 @@ class TestRetryRateLimiting:
always_failing_function()
# Should have sent exactly one alert at the threshold
assert mock_client.system_alert.call_count == 1
assert mock_client.discord_system_alert.call_count == 1

View File

@@ -682,8 +682,6 @@ class Secrets(UpdateTrackingModel["Secrets"], BaseSettings):
description="The LaunchDarkly SDK key for feature flag management",
)
allquiet_webhook_url: str = Field(default="", description="AllQuiet webhook URL")
ayrshare_api_key: str = Field(default="", description="Ayrshare API Key")
ayrshare_jwt_key: str = Field(default="", description="Ayrshare private Key")