diff --git a/autogpt_platform/backend/backend/monitoring/late_execution_monitor.py b/autogpt_platform/backend/backend/monitoring/late_execution_monitor.py index 6a0fc87964..fe0ccdae91 100644 --- a/autogpt_platform/backend/backend/monitoring/late_execution_monitor.py +++ b/autogpt_platform/backend/backend/monitoring/late_execution_monitor.py @@ -29,7 +29,9 @@ class LateExecutionMonitor: def check_late_executions(self) -> str: """Check for late executions and send alerts if found.""" - late_executions = execution_utils.get_db_client().get_graph_executions( + + # Check for QUEUED executions + queued_late_executions = execution_utils.get_db_client().get_graph_executions( statuses=[ExecutionStatus.QUEUED], created_time_gte=datetime.now(timezone.utc) - timedelta( @@ -40,24 +42,59 @@ class LateExecutionMonitor: limit=1000, ) - if not late_executions: + # Check for RUNNING executions stuck for more than 24 hours + running_late_executions = execution_utils.get_db_client().get_graph_executions( + statuses=[ExecutionStatus.RUNNING], + created_time_gte=datetime.now(timezone.utc) + - timedelta(hours=24) + - timedelta( + seconds=self.config.execution_late_notification_checkrange_secs + ), + created_time_lte=datetime.now(timezone.utc) - timedelta(hours=24), + limit=1000, + ) + + all_late_executions = queued_late_executions + running_late_executions + + if not all_late_executions: return "No late executions detected." - num_late_executions = len(late_executions) - num_users = len(set([r.user_id for r in late_executions])) + # Sort by created time (oldest first) + all_late_executions.sort(key=lambda x: x.started_at) + + num_total_late = len(all_late_executions) + num_queued = len(queued_late_executions) + num_running = len(running_late_executions) + num_users = len(set([r.user_id for r in all_late_executions])) + + # Truncate to max 100 entries + truncated_executions = all_late_executions[:100] + was_truncated = num_total_late > 100 late_execution_details = [ - f"* `Execution ID: {exec.id}, Graph ID: {exec.graph_id}v{exec.graph_version}, User ID: {exec.user_id}, Created At: {exec.started_at.isoformat()}`" - for exec in late_executions + f"* `Execution ID: {exec.id}, Graph ID: {exec.graph_id}v{exec.graph_version}, User ID: {exec.user_id}, Status: {exec.status}, Created At: {exec.started_at.isoformat()}`" + for exec in truncated_executions ] - error = LateExecutionException( - f"Late executions detected: {num_late_executions} late executions from {num_users} users " - f"in the last {self.config.execution_late_notification_checkrange_secs} seconds. " - f"Graph has been queued for more than {self.config.execution_late_notification_threshold_secs} seconds. " - "Please check the executor status. Details:\n" - + "\n".join(late_execution_details) + message_parts = [ + f"Late executions detected: {num_total_late} total late executions ({num_queued} QUEUED, {num_running} RUNNING) from {num_users} users.", + f"QUEUED executions have been waiting for more than {self.config.execution_late_notification_threshold_secs} seconds.", + "RUNNING executions have been running for more than 24 hours.", + "Please check the executor status.", + ] + + if was_truncated: + message_parts.append( + f"\nShowing first 100 of {num_total_late} late executions:" + ) + else: + message_parts.append("\nDetails:") + + error_message = ( + "\n".join(message_parts) + "\n" + "\n".join(late_execution_details) ) + + error = LateExecutionException(error_message) msg = str(error) sentry_capture_error(error)