feat(backend): Avoid executor over-consuming messages when it's fully occupied (#10449)

When we run multiple instances of the executor, a fully occupied executor
can still over-consume messages and end up queuing agent execution
requests internally instead of letting another executor with free
capacity handle the job. This change solves the problem by rejecting
(requeuing) execution requests when the executor is already at capacity.

### Changes 🏗️

* Reject execution request when the executor is full.
* Improve `active_graph_runs` tracking for better horizontal scaling
heuristics.

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  <!-- Put your test plan here: -->
  - [x] Manual graph execution & CI
This commit is contained in:
Zamil Majdy
2025-07-29 07:08:27 +08:00
committed by GitHub
parent b71d0ec805
commit 4d05a27388
3 changed files with 123 additions and 15 deletions

View File

@@ -22,6 +22,48 @@ from backend.util.settings import Settings
logger = logging.getLogger(__name__)

# RabbitMQ Connection Constants
# These constants address specific connection-stability issues observed in production.

# HEARTBEAT_INTERVAL_SYNC (30 seconds)
# Problem: Need to prove we're alive frequently, but also survive long network hiccups
# Solution: Frequent heartbeats (30s) with a longer detection window (~60s = 1min detection)
# Use case: Long-running consumers need to stay connected but handle reconnection gracefully
# Trade-off: More network chatter, but proves liveness while allowing reconnection time
HEARTBEAT_INTERVAL_SYNC = 30

# HEARTBEAT_INTERVAL_ASYNC (60 seconds)
# Problem: Same need for frequent proof-of-life with reasonable detection
# Solution: Frequent enough to prove activity, not too aggressive on detection
# Use case: Async RabbitMQ operations with good responsiveness
# Trade-off: Regular heartbeats with a ~2-minute detection window
HEARTBEAT_INTERVAL_ASYNC = 60

# BLOCKED_CONNECTION_TIMEOUT (300s = 5 minutes)
# Problem: Connection can hang indefinitely if the RabbitMQ server is overloaded
# Solution: Time out and reconnect if the connection stays blocked for too long
# Use case: Network issues or server resource constraints
BLOCKED_CONNECTION_TIMEOUT = 300

# SOCKET_TIMEOUT (30s)
# Problem: Network operations can hang indefinitely on poor connections
# Solution: Fail fast on socket operations to enable quick reconnection
# Use case: Network latency, packet loss, or connectivity issues
SOCKET_TIMEOUT = 30

# CONNECTION_ATTEMPTS (5 attempts)
# Problem: Temporary network issues cause permanent connection failures
# Solution: More retry attempts for better resilience during long executions
# Use case: Transient network issues during service startup or long-running operations
CONNECTION_ATTEMPTS = 5

# RETRY_DELAY (1 second)
# Problem: Immediate reconnection attempts can overwhelm the server
# Solution: Quick retry for faster recovery while still being respectful
# Use case: Faster reconnection for long-running executions that need to resume quickly
RETRY_DELAY = 1
class ExchangeType(str, Enum):
DIRECT = "direct"
@@ -117,8 +159,11 @@ class SyncRabbitMQ(RabbitMQBase):
port=self.port,
virtual_host=self.config.vhost,
credentials=credentials,
heartbeat=600,
blocked_connection_timeout=300,
heartbeat=HEARTBEAT_INTERVAL_SYNC,
blocked_connection_timeout=BLOCKED_CONNECTION_TIMEOUT,
socket_timeout=SOCKET_TIMEOUT,
connection_attempts=CONNECTION_ATTEMPTS,
retry_delay=RETRY_DELAY,
)
self._connection = pika.BlockingConnection(parameters)
@@ -227,6 +272,8 @@ class AsyncRabbitMQ(RabbitMQBase):
login=self.username,
password=self.password,
virtualhost=self.config.vhost.lstrip("/"),
heartbeat=HEARTBEAT_INTERVAL_ASYNC,
blocked_connection_timeout=BLOCKED_CONNECTION_TIMEOUT,
)
self._channel = await self._connection.channel()
await self._channel.set_qos(prefetch_count=1)

View File

@@ -205,8 +205,7 @@ async def execute_node(
yield output_name, output_data
except Exception as e:
error_msg = str(e)
yield "error", error_msg
yield "error", str(e) or type(e).__name__
raise e
finally:
@@ -514,7 +513,9 @@ class Executor:
@classmethod
@error_logged(swallow=False)
def on_graph_execution(
cls, graph_exec: GraphExecutionEntry, cancel: threading.Event
cls,
graph_exec: GraphExecutionEntry,
cancel: threading.Event,
):
log_metadata = LogMetadata(
logger=_logger,
@@ -1078,15 +1079,28 @@ class ExecutionManager(AppProcess):
@continuous_retry()
def _consume_execution_run(self):
    """Blocking consumer loop for graph-execution run messages.

    Long-running executions survive because:
      1. Connection-level heartbeats (30s interval) with quick detection (60s).
      2. The consumer timeout is disabled (x-consumer-timeout: 0), so an
         execution may take unlimited time before acking.
      3. Connection retry settings (5 attempts, 1s delay) reconnect quickly.
      4. Process monitoring returns messages to the queue if an executor dies.
    """
    mq_client = SyncRabbitMQ(create_execution_queue_config())
    mq_client.connect()
    channel = mq_client.get_channel()
    # Never hold more unacked messages than we have worker slots.
    channel.basic_qos(prefetch_count=self.pool_size)
    # auto_ack=False: a message is only acknowledged once its execution
    # finishes, so an executor crash requeues the work instead of losing it.
    channel.basic_consume(
        queue=GRAPH_EXECUTION_QUEUE_NAME,
        on_message_callback=self._handle_run_message,
        auto_ack=False,
        consumer_tag="graph_execution_consumer",
    )
    channel.confirm_delivery()
    logger.info(f"[{self.service_name}] ⏳ Starting to consume run messages...")
    channel.start_consuming()
    # start_consuming() blocks forever in normal operation; reaching this
    # line means the consumer died, so raise and let @continuous_retry restart.
    raise RuntimeError(f"❌ run message consumer is stopped: {channel}")
@@ -1136,6 +1150,13 @@ class ExecutionManager(AppProcess):
body: bytes,
):
delivery_tag = method.delivery_tag
# Check if we can accept more runs
self._cleanup_completed_runs()
if len(self.active_graph_runs) >= self.pool_size:
channel.basic_nack(delivery_tag, requeue=True)
return
try:
graph_exec_entry = GraphExecutionEntry.model_validate_json(body)
except Exception as e:
@@ -1155,6 +1176,7 @@ class ExecutionManager(AppProcess):
return
cancel_event = multiprocessing.Manager().Event()
future = self.executor.submit(
Executor.on_graph_execution, graph_exec_entry, cancel_event
)
@@ -1165,27 +1187,54 @@ class ExecutionManager(AppProcess):
def _on_run_done(f: Future):
logger.info(f"[{self.service_name}] Run completed for {graph_exec_id}")
try:
self.active_graph_runs.pop(graph_exec_id, None)
active_runs_gauge.set(len(self.active_graph_runs))
utilization_gauge.set(len(self.active_graph_runs) / self.pool_size)
if exec_error := f.exception():
logger.error(
f"[{self.service_name}] Execution for {graph_exec_id} failed: {exec_error}"
)
channel.connection.add_callback_threadsafe(
lambda: channel.basic_nack(delivery_tag, requeue=True)
)
try:
channel.connection.add_callback_threadsafe(
lambda: channel.basic_nack(delivery_tag, requeue=True)
)
except Exception as ack_error:
logger.error(
f"[{self.service_name}] Failed to NACK message for {graph_exec_id}: {ack_error}"
)
else:
channel.connection.add_callback_threadsafe(
lambda: channel.basic_ack(delivery_tag)
)
try:
channel.connection.add_callback_threadsafe(
lambda: channel.basic_ack(delivery_tag)
)
except Exception as ack_error:
logger.error(
f"[{self.service_name}] Failed to ACK message for {graph_exec_id}: {ack_error}"
)
except BaseException as e:
logger.exception(
f"[{self.service_name}] Error acknowledging message: {e}"
f"[{self.service_name}] Error in run completion callback: {e}"
)
finally:
self._cleanup_completed_runs()
future.add_done_callback(_on_run_done)
def _cleanup_completed_runs(self, log=logger.info) -> list[str]:
    """Remove completed futures from active_graph_runs and update metrics.

    Args:
        log: Callable used to report each cleaned-up run; defaults to
            logger.info (cleanup passes its own logger during shutdown).

    Returns:
        The graph-execution IDs that were removed in this pass.
    """
    # Snapshot the items before iterating: run-completion callbacks execute on
    # executor threads and also mutate active_graph_runs, so iterating the live
    # dict could raise "dictionary changed size during iteration".
    completed_runs = [
        graph_exec_id
        for graph_exec_id, (future, _) in list(self.active_graph_runs.items())
        if future.done()
    ]
    for geid in completed_runs:
        log(f"[{self.service_name}] ✅ Cleaned up completed run {geid}")
        self.active_graph_runs.pop(geid, None)

    # Refresh gauges so horizontal-scaling heuristics see the current load.
    active_count = len(self.active_graph_runs)
    active_runs_gauge.set(active_count)
    utilization_gauge.set(active_count / self.pool_size)
    return completed_runs
def cleanup(self):
    """Release process resources on shutdown, then run manager-specific teardown."""
    super().cleanup()
    self._on_cleanup()
@@ -1202,6 +1251,9 @@ class ExecutionManager(AppProcess):
log(f"{prefix} ⏳ Shutting down GraphExec pool...")
self.executor.shutdown(cancel_futures=True, wait=False)
log(f"{prefix} ⏳ Cleaning up active graph runs...")
self._cleanup_completed_runs(log=log)
log(f"{prefix} ⏳ Disconnecting Redis...")
redis.disconnect()

View File

@@ -668,6 +668,15 @@ def create_execution_queue_config() -> RabbitMQConfig:
routing_key=GRAPH_EXECUTION_ROUTING_KEY,
durable=True,
auto_delete=False,
arguments={
# x-consumer-timeout (0 = disabled)
# Problem: Default 30-minute consumer timeout kills long-running graph executions
# Original error: "Consumer acknowledgement timed out after 1800000 ms (30 minutes)"
# Solution: Disable consumer timeout entirely - let graphs run indefinitely
# Safety: Heartbeat mechanism now handles dead consumer detection instead
# Use case: Graph executions that take hours to complete (AI model training, etc.)
"x-consumer-timeout": 0,
},
)
cancel_queue = Queue(
name=GRAPH_EXECUTION_CANCEL_QUEUE_NAME,