mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-01-10 23:58:06 -05:00
feat: Add alert for notifying stuck running agent for more than a day (#10438)
We've been reporting agents that are stuck in the `QUEUED` status. This change also reports agents that have been stuck in the `RUNNING` status for more than 24 hours. ### Changes 🏗️ Report agents that have been in the `RUNNING` status for more than 24 hours. ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: <!-- Put your test plan here: --> - [x] Manual test
This commit is contained in:
@@ -29,7 +29,9 @@ class LateExecutionMonitor:
|
||||
|
||||
def check_late_executions(self) -> str:
|
||||
"""Check for late executions and send alerts if found."""
|
||||
late_executions = execution_utils.get_db_client().get_graph_executions(
|
||||
|
||||
# Check for QUEUED executions
|
||||
queued_late_executions = execution_utils.get_db_client().get_graph_executions(
|
||||
statuses=[ExecutionStatus.QUEUED],
|
||||
created_time_gte=datetime.now(timezone.utc)
|
||||
- timedelta(
|
||||
@@ -40,24 +42,59 @@ class LateExecutionMonitor:
|
||||
limit=1000,
|
||||
)
|
||||
|
||||
if not late_executions:
|
||||
# Check for RUNNING executions stuck for more than 24 hours
|
||||
running_late_executions = execution_utils.get_db_client().get_graph_executions(
|
||||
statuses=[ExecutionStatus.RUNNING],
|
||||
created_time_gte=datetime.now(timezone.utc)
|
||||
- timedelta(hours=24)
|
||||
- timedelta(
|
||||
seconds=self.config.execution_late_notification_checkrange_secs
|
||||
),
|
||||
created_time_lte=datetime.now(timezone.utc) - timedelta(hours=24),
|
||||
limit=1000,
|
||||
)
|
||||
|
||||
all_late_executions = queued_late_executions + running_late_executions
|
||||
|
||||
if not all_late_executions:
|
||||
return "No late executions detected."
|
||||
|
||||
num_late_executions = len(late_executions)
|
||||
num_users = len(set([r.user_id for r in late_executions]))
|
||||
# Sort by created time (oldest first)
|
||||
all_late_executions.sort(key=lambda x: x.started_at)
|
||||
|
||||
num_total_late = len(all_late_executions)
|
||||
num_queued = len(queued_late_executions)
|
||||
num_running = len(running_late_executions)
|
||||
num_users = len(set([r.user_id for r in all_late_executions]))
|
||||
|
||||
# Truncate to max 100 entries
|
||||
truncated_executions = all_late_executions[:100]
|
||||
was_truncated = num_total_late > 100
|
||||
|
||||
late_execution_details = [
|
||||
f"* `Execution ID: {exec.id}, Graph ID: {exec.graph_id}v{exec.graph_version}, User ID: {exec.user_id}, Created At: {exec.started_at.isoformat()}`"
|
||||
for exec in late_executions
|
||||
f"* `Execution ID: {exec.id}, Graph ID: {exec.graph_id}v{exec.graph_version}, User ID: {exec.user_id}, Status: {exec.status}, Created At: {exec.started_at.isoformat()}`"
|
||||
for exec in truncated_executions
|
||||
]
|
||||
|
||||
error = LateExecutionException(
|
||||
f"Late executions detected: {num_late_executions} late executions from {num_users} users "
|
||||
f"in the last {self.config.execution_late_notification_checkrange_secs} seconds. "
|
||||
f"Graph has been queued for more than {self.config.execution_late_notification_threshold_secs} seconds. "
|
||||
"Please check the executor status. Details:\n"
|
||||
+ "\n".join(late_execution_details)
|
||||
message_parts = [
|
||||
f"Late executions detected: {num_total_late} total late executions ({num_queued} QUEUED, {num_running} RUNNING) from {num_users} users.",
|
||||
f"QUEUED executions have been waiting for more than {self.config.execution_late_notification_threshold_secs} seconds.",
|
||||
"RUNNING executions have been running for more than 24 hours.",
|
||||
"Please check the executor status.",
|
||||
]
|
||||
|
||||
if was_truncated:
|
||||
message_parts.append(
|
||||
f"\nShowing first 100 of {num_total_late} late executions:"
|
||||
)
|
||||
else:
|
||||
message_parts.append("\nDetails:")
|
||||
|
||||
error_message = (
|
||||
"\n".join(message_parts) + "\n" + "\n".join(late_execution_details)
|
||||
)
|
||||
|
||||
error = LateExecutionException(error_message)
|
||||
msg = str(error)
|
||||
|
||||
sentry_capture_error(error)
|
||||
|
||||
Reference in New Issue
Block a user