feat: Add alert for notifying stuck running agent for more than a day (#10438)

We've been reporting agents that are stuck on the `QUEUED` status. This change includes the one on. RUNNING status that's been stuck for more than 24hours. ### Changes 🏗️ Report agent on RUNNING status for more than 24hours. ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan:  - [x] Manual test
2026-01-10 23:58:06 -05:00 · 2025-07-23 22:35:01 +08:00
parent 51a39ef81a
commit ccad66427c
1 changed files with 49 additions and 12 deletions
--- a/autogpt_platform/backend/backend/monitoring/late_execution_monitor.py
+++ b/autogpt_platform/backend/backend/monitoring/late_execution_monitor.py
@@ -29,7 +29,9 @@ class LateExecutionMonitor:

    def check_late_executions(self) -> str:
        """Check for late executions and send alerts if found."""
-        late_executions = execution_utils.get_db_client().get_graph_executions(
+
+        # Check for QUEUED executions
+        queued_late_executions = execution_utils.get_db_client().get_graph_executions(
            statuses=[ExecutionStatus.QUEUED],
            created_time_gte=datetime.now(timezone.utc)
            - timedelta(
@@ -40,24 +42,59 @@ class LateExecutionMonitor:
            limit=1000,
        )

-        if not late_executions:
+        # Check for RUNNING executions stuck for more than 24 hours
+        running_late_executions = execution_utils.get_db_client().get_graph_executions(
+            statuses=[ExecutionStatus.RUNNING],
+            created_time_gte=datetime.now(timezone.utc)
+            - timedelta(hours=24)
+            - timedelta(
+                seconds=self.config.execution_late_notification_checkrange_secs
+            ),
+            created_time_lte=datetime.now(timezone.utc) - timedelta(hours=24),
+            limit=1000,
+        )
+
+        all_late_executions = queued_late_executions + running_late_executions
+
+        if not all_late_executions:
            return "No late executions detected."

-        num_late_executions = len(late_executions)
-        num_users = len(set([r.user_id for r in late_executions]))
+        # Sort by created time (oldest first)
+        all_late_executions.sort(key=lambda x: x.started_at)
+
+        num_total_late = len(all_late_executions)
+        num_queued = len(queued_late_executions)
+        num_running = len(running_late_executions)
+        num_users = len(set([r.user_id for r in all_late_executions]))
+
+        # Truncate to max 100 entries
+        truncated_executions = all_late_executions[:100]
+        was_truncated = num_total_late > 100

        late_execution_details = [
-            f"* `Execution ID: {exec.id}, Graph ID: {exec.graph_id}v{exec.graph_version}, User ID: {exec.user_id}, Created At: {exec.started_at.isoformat()}`"
-            for exec in late_executions
+            f"* `Execution ID: {exec.id}, Graph ID: {exec.graph_id}v{exec.graph_version}, User ID: {exec.user_id}, Status: {exec.status}, Created At: {exec.started_at.isoformat()}`"
+            for exec in truncated_executions
        ]

-        error = LateExecutionException(
-            f"Late executions detected: {num_late_executions} late executions from {num_users} users "
-            f"in the last {self.config.execution_late_notification_checkrange_secs} seconds. "
-            f"Graph has been queued for more than {self.config.execution_late_notification_threshold_secs} seconds. "
-            "Please check the executor status. Details:\n"
-            + "\n".join(late_execution_details)
+        message_parts = [
+            f"Late executions detected: {num_total_late} total late executions ({num_queued} QUEUED, {num_running} RUNNING) from {num_users} users.",
+            f"QUEUED executions have been waiting for more than {self.config.execution_late_notification_threshold_secs} seconds.",
+            "RUNNING executions have been running for more than 24 hours.",
+            "Please check the executor status.",
+        ]
+
+        if was_truncated:
+            message_parts.append(
+                f"\nShowing first 100 of {num_total_late} late executions:"
+            )
+        else:
+            message_parts.append("\nDetails:")
+
+        error_message = (
+            "\n".join(message_parts) + "\n" + "\n".join(late_execution_details)
        )
+
+        error = LateExecutionException(error_message)
        msg = str(error)

        sentry_capture_error(error)