feat(backend): Avoid executor over-consuming messages when it's fully occupied (#10449)

When we run multiple instances of the executor, a fully occupied executor
can still over-consume messages and end up queuing agent execution
requests internally instead of letting another executor with free
capacity handle the job. This change solves the problem by rejecting
(requeuing) execution requests when the executor is already at capacity.

### Changes 🏗️

* Reject execution request when the executor is full.
* Improve `active_graph_runs` tracking for better horizontal scaling
heuristics.

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  <!-- Put your test plan here: -->
  - [x] Manual graph execution & CI
This commit is contained in:
Zamil Majdy
2025-07-29 07:08:27 +08:00
committed by GitHub
parent b71d0ec805
commit 4d05a27388
3 changed files with 123 additions and 15 deletions

View File

@@ -22,6 +22,48 @@ from backend.util.settings import Settings
logger = logging.getLogger(__name__)

# RabbitMQ Connection Constants
# These constants address specific connection-stability issues observed in production.

# HEARTBEAT_INTERVAL_SYNC (30 seconds)
# Problem: Need to prove we're alive frequently, but also survive long network hiccups
# Solution: Frequent heartbeats (30s) with a longer detection window (~60s = 1min detection)
# Use case: Long-running consumers need to stay connected but handle reconnection gracefully
# Trade-off: More network chatter, but proves liveness while allowing reconnection time
HEARTBEAT_INTERVAL_SYNC = 30

# HEARTBEAT_INTERVAL_ASYNC (60 seconds)
# Problem: Same need for frequent proof-of-life with reasonable detection
# Solution: Frequent enough to prove activity, not too aggressive on detection
# Use case: Async RabbitMQ operations with good responsiveness
# Trade-off: Regular heartbeats with a ~2-minute detection window
HEARTBEAT_INTERVAL_ASYNC = 60

# BLOCKED_CONNECTION_TIMEOUT (300s = 5 minutes)
# Problem: Connection can hang indefinitely if the RabbitMQ server is overloaded
# Solution: Time out and reconnect if the connection stays blocked for too long
# Use case: Network issues or server resource constraints
BLOCKED_CONNECTION_TIMEOUT = 300

# SOCKET_TIMEOUT (30s)
# Problem: Network operations can hang indefinitely on poor connections
# Solution: Fail fast on socket operations to enable quick reconnection
# Use case: Network latency, packet loss, or connectivity issues
SOCKET_TIMEOUT = 30

# CONNECTION_ATTEMPTS (5 attempts)
# Problem: Temporary network issues cause permanent connection failures
# Solution: More retry attempts for better resilience during long executions
# Use case: Transient network issues during service startup or long-running operations
CONNECTION_ATTEMPTS = 5

# RETRY_DELAY (1 second)
# Problem: Immediate reconnection attempts can overwhelm the server
# Solution: Quick retry for faster recovery while still being respectful
# Use case: Faster reconnection for long-running executions that need to resume quickly
RETRY_DELAY = 1
class ExchangeType(str, Enum):
DIRECT = "direct"
@@ -117,8 +159,11 @@ class SyncRabbitMQ(RabbitMQBase):
port=self.port,
virtual_host=self.config.vhost,
credentials=credentials,
heartbeat=600,
blocked_connection_timeout=300,
heartbeat=HEARTBEAT_INTERVAL_SYNC,
blocked_connection_timeout=BLOCKED_CONNECTION_TIMEOUT,
socket_timeout=SOCKET_TIMEOUT,
connection_attempts=CONNECTION_ATTEMPTS,
retry_delay=RETRY_DELAY,
)
self._connection = pika.BlockingConnection(parameters)
@@ -227,6 +272,8 @@ class AsyncRabbitMQ(RabbitMQBase):
login=self.username,
password=self.password,
virtualhost=self.config.vhost.lstrip("/"),
heartbeat=HEARTBEAT_INTERVAL_ASYNC,
blocked_connection_timeout=BLOCKED_CONNECTION_TIMEOUT,
)
self._channel = await self._connection.channel()
await self._channel.set_qos(prefetch_count=1)

View File

@@ -205,8 +205,7 @@ async def execute_node(
yield output_name, output_data
except Exception as e:
error_msg = str(e)
yield "error", error_msg
yield "error", str(e) or type(e).__name__
raise e
finally:
@@ -514,7 +513,9 @@ class Executor:
@classmethod
@error_logged(swallow=False)
def on_graph_execution(
cls, graph_exec: GraphExecutionEntry, cancel: threading.Event
cls,
graph_exec: GraphExecutionEntry,
cancel: threading.Event,
):
log_metadata = LogMetadata(
logger=_logger,
@@ -1078,15 +1079,28 @@ class ExecutionManager(AppProcess):
@continuous_retry()
def _consume_execution_run(self):
    """Blocking consumer loop for graph-execution run messages.

    Long-running executions survive because:
      1. Connection-level heartbeats (30s interval) with quick detection (60s).
      2. The consumer timeout is disabled (x-consumer-timeout: 0), so an
         execution may take unlimited time before acking.
      3. Connection retry settings (5 attempts, 1s delay) reconnect quickly.
      4. Process monitoring returns messages to the queue if an executor dies.
    """
    mq_client = SyncRabbitMQ(create_execution_queue_config())
    mq_client.connect()
    channel = mq_client.get_channel()
    # Never hold more unacked messages than we have worker slots.
    channel.basic_qos(prefetch_count=self.pool_size)
    # auto_ack=False: a message is only acknowledged once its execution
    # finishes, so an executor crash requeues the work instead of losing it.
    channel.basic_consume(
        queue=GRAPH_EXECUTION_QUEUE_NAME,
        on_message_callback=self._handle_run_message,
        auto_ack=False,
        consumer_tag="graph_execution_consumer",
    )
    channel.confirm_delivery()
    logger.info(f"[{self.service_name}] ⏳ Starting to consume run messages...")
    channel.start_consuming()
    # start_consuming() blocks forever in normal operation; reaching this
    # line means the consumer died, so raise and let @continuous_retry restart.
    raise RuntimeError(f"❌ run message consumer is stopped: {channel}")
@@ -1136,6 +1150,13 @@ class ExecutionManager(AppProcess):
body: bytes,
):
delivery_tag = method.delivery_tag
# Check if we can accept more runs
self._cleanup_completed_runs()
if len(self.active_graph_runs) >= self.pool_size:
channel.basic_nack(delivery_tag, requeue=True)
return
try:
graph_exec_entry = GraphExecutionEntry.model_validate_json(body)
except Exception as e:
@@ -1155,6 +1176,7 @@ class ExecutionManager(AppProcess):
return
cancel_event = multiprocessing.Manager().Event()
future = self.executor.submit(
Executor.on_graph_execution, graph_exec_entry, cancel_event
)
@@ -1165,27 +1187,54 @@ class ExecutionManager(AppProcess):
def _on_run_done(f: Future):
logger.info(f"[{self.service_name}] Run completed for {graph_exec_id}")
try:
self.active_graph_runs.pop(graph_exec_id, None)
active_runs_gauge.set(len(self.active_graph_runs))
utilization_gauge.set(len(self.active_graph_runs) / self.pool_size)
if exec_error := f.exception():
logger.error(
f"[{self.service_name}] Execution for {graph_exec_id} failed: {exec_error}"
)
channel.connection.add_callback_threadsafe(
lambda: channel.basic_nack(delivery_tag, requeue=True)
)
try:
channel.connection.add_callback_threadsafe(
lambda: channel.basic_nack(delivery_tag, requeue=True)
)
except Exception as ack_error:
logger.error(
f"[{self.service_name}] Failed to NACK message for {graph_exec_id}: {ack_error}"
)
else:
channel.connection.add_callback_threadsafe(
lambda: channel.basic_ack(delivery_tag)
)
try:
channel.connection.add_callback_threadsafe(
lambda: channel.basic_ack(delivery_tag)
)
except Exception as ack_error:
logger.error(
f"[{self.service_name}] Failed to ACK message for {graph_exec_id}: {ack_error}"
)
except BaseException as e:
logger.exception(
f"[{self.service_name}] Error acknowledging message: {e}"
f"[{self.service_name}] Error in run completion callback: {e}"
)
finally:
self._cleanup_completed_runs()
future.add_done_callback(_on_run_done)
def _cleanup_completed_runs(self, log=logger.info) -> list[str]:
    """Remove completed futures from active_graph_runs and update metrics.

    Args:
        log: Callable used to report each cleaned-up run; defaults to
            logger.info (cleanup passes its own logger during shutdown).

    Returns:
        The graph-execution IDs that were removed in this pass.
    """
    # Snapshot the items before iterating: run-completion callbacks execute on
    # executor threads and also mutate active_graph_runs, so iterating the live
    # dict could raise "dictionary changed size during iteration".
    completed_runs = [
        graph_exec_id
        for graph_exec_id, (future, _) in list(self.active_graph_runs.items())
        if future.done()
    ]
    for geid in completed_runs:
        log(f"[{self.service_name}] ✅ Cleaned up completed run {geid}")
        self.active_graph_runs.pop(geid, None)

    # Refresh gauges so horizontal-scaling heuristics see the current load.
    active_count = len(self.active_graph_runs)
    active_runs_gauge.set(active_count)
    utilization_gauge.set(active_count / self.pool_size)
    return completed_runs
def cleanup(self):
    """Release process resources on shutdown, then run manager-specific teardown."""
    super().cleanup()
    self._on_cleanup()
@@ -1202,6 +1251,9 @@ class ExecutionManager(AppProcess):
log(f"{prefix} ⏳ Shutting down GraphExec pool...")
self.executor.shutdown(cancel_futures=True, wait=False)
log(f"{prefix} ⏳ Cleaning up active graph runs...")
self._cleanup_completed_runs(log=log)
log(f"{prefix} ⏳ Disconnecting Redis...")
redis.disconnect()

View File

@@ -668,6 +668,15 @@ def create_execution_queue_config() -> RabbitMQConfig:
routing_key=GRAPH_EXECUTION_ROUTING_KEY,
durable=True,
auto_delete=False,
arguments={
# x-consumer-timeout (0 = disabled)
# Problem: Default 30-minute consumer timeout kills long-running graph executions
# Original error: "Consumer acknowledgement timed out after 1800000 ms (30 minutes)"
# Solution: Disable consumer timeout entirely - let graphs run indefinitely
# Safety: Heartbeat mechanism now handles dead consumer detection instead
# Use case: Graph executions that take hours to complete (AI model training, etc.)
"x-consumer-timeout": 0,
},
)
cancel_queue = Queue(
name=GRAPH_EXECUTION_CANCEL_QUEUE_NAME,