From bf92e7dbc8afb4cd7cf827cfa55c890a95ff9f73 Mon Sep 17 00:00:00 2001 From: Zamil Majdy Date: Sun, 17 Aug 2025 02:06:06 +0400 Subject: [PATCH] hotfix(backend/executor): Fix RabbitMQ channel retry logic in executor (#10661) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary **HOTFIX for production** - Fixes the executor being stuck in an infinite retry loop when RabbitMQ channels are closed - Ensures proper reconnection by checking channel state before attempting to consume messages - Prevents the accumulation of thousands of retry attempts (7000+ retries were observed) ## Changes The executor was stuck repeatedly failing with "Channel is closed" errors because the `continuous_retry` decorator was attempting to reuse closed channels instead of creating new ones. Added channel state checks (`is_ready`) before connecting in both: - `_consume_execution_run()` - `_consume_execution_cancel()` When a channel is not ready (closed), the code now: 1. Disconnects the client (a safe operation that checks whether it is already disconnected) 2. Establishes a fresh connection with a new channel 3. 
Proceeds with message consumption ## Test plan - [x] Verified the disconnect() method is safe to call on already disconnected clients - [x] Confirmed is_ready property checks both connection and channel state - [ ] Deploy to environment and verify executors reconnect properly after channel failures - [ ] Monitor logs to ensure no more "Channel is closed" retry loops ## Related Issues Fixes critical production issue where: - Executor pods show repeated "Channel is closed" errors - 757 messages stuck in `graph_execution_queue` - 102,286 messages in `failed_notifications` queue - RabbitMQ logs show connections being closed due to missed heartbeats 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Claude --- autogpt_platform/backend/backend/executor/manager.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/autogpt_platform/backend/backend/executor/manager.py b/autogpt_platform/backend/backend/executor/manager.py index c5c45c2345..cc142bacd1 100644 --- a/autogpt_platform/backend/backend/executor/manager.py +++ b/autogpt_platform/backend/backend/executor/manager.py @@ -1208,6 +1208,9 @@ class ExecutionManager(AppProcess): ) return + # Check if channel is closed and force reconnection if needed + if not self.cancel_client.is_ready: + self.cancel_client.disconnect() self.cancel_client.connect() cancel_channel = self.cancel_client.get_channel() cancel_channel.basic_consume( @@ -1237,6 +1240,9 @@ class ExecutionManager(AppProcess): ) return + # Check if channel is closed and force reconnection if needed + if not self.run_client.is_ready: + self.run_client.disconnect() self.run_client.connect() run_channel = self.run_client.get_channel() run_channel.basic_qos(prefetch_count=self.pool_size)