feat: Add alert for notifying stuck running agent for more than a day (#10438)

We've been reporting agents that are stuck on the `QUEUED` status. This
change includes the one on. RUNNING status that's been stuck for more
than 24hours.

### Changes 🏗️

Report agent on RUNNING status for more than 24hours.

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  <!-- Put your test plan here: -->
  - [x] Manual test
This commit is contained in:
Zamil Majdy
2025-07-23 22:35:01 +08:00
committed by GitHub
parent 51a39ef81a
commit ccad66427c

View File

@@ -29,7 +29,9 @@ class LateExecutionMonitor:
def check_late_executions(self) -> str:
"""Check for late executions and send alerts if found."""
late_executions = execution_utils.get_db_client().get_graph_executions(
# Check for QUEUED executions
queued_late_executions = execution_utils.get_db_client().get_graph_executions(
statuses=[ExecutionStatus.QUEUED],
created_time_gte=datetime.now(timezone.utc)
- timedelta(
@@ -40,24 +42,59 @@ class LateExecutionMonitor:
limit=1000,
)
if not late_executions:
# Check for RUNNING executions stuck for more than 24 hours
running_late_executions = execution_utils.get_db_client().get_graph_executions(
statuses=[ExecutionStatus.RUNNING],
created_time_gte=datetime.now(timezone.utc)
- timedelta(hours=24)
- timedelta(
seconds=self.config.execution_late_notification_checkrange_secs
),
created_time_lte=datetime.now(timezone.utc) - timedelta(hours=24),
limit=1000,
)
all_late_executions = queued_late_executions + running_late_executions
if not all_late_executions:
return "No late executions detected."
num_late_executions = len(late_executions)
num_users = len(set([r.user_id for r in late_executions]))
# Sort by created time (oldest first)
all_late_executions.sort(key=lambda x: x.started_at)
num_total_late = len(all_late_executions)
num_queued = len(queued_late_executions)
num_running = len(running_late_executions)
num_users = len(set([r.user_id for r in all_late_executions]))
# Truncate to max 100 entries
truncated_executions = all_late_executions[:100]
was_truncated = num_total_late > 100
late_execution_details = [
f"* `Execution ID: {exec.id}, Graph ID: {exec.graph_id}v{exec.graph_version}, User ID: {exec.user_id}, Created At: {exec.started_at.isoformat()}`"
for exec in late_executions
f"* `Execution ID: {exec.id}, Graph ID: {exec.graph_id}v{exec.graph_version}, User ID: {exec.user_id}, Status: {exec.status}, Created At: {exec.started_at.isoformat()}`"
for exec in truncated_executions
]
error = LateExecutionException(
f"Late executions detected: {num_late_executions} late executions from {num_users} users "
f"in the last {self.config.execution_late_notification_checkrange_secs} seconds. "
f"Graph has been queued for more than {self.config.execution_late_notification_threshold_secs} seconds. "
"Please check the executor status. Details:\n"
+ "\n".join(late_execution_details)
message_parts = [
f"Late executions detected: {num_total_late} total late executions ({num_queued} QUEUED, {num_running} RUNNING) from {num_users} users.",
f"QUEUED executions have been waiting for more than {self.config.execution_late_notification_threshold_secs} seconds.",
"RUNNING executions have been running for more than 24 hours.",
"Please check the executor status.",
]
if was_truncated:
message_parts.append(
f"\nShowing first 100 of {num_total_late} late executions:"
)
else:
message_parts.append("\nDetails:")
error_message = (
"\n".join(message_parts) + "\n" + "\n".join(late_execution_details)
)
error = LateExecutionException(error_message)
msg = str(error)
sentry_capture_error(error)