mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
feat(backend): Add thread safety to NodeExecutionProgress output handling (#10415)
## Summary - Add thread safety to NodeExecutionProgress class to prevent race conditions between graph executor and node executor threads - Fixes potential data corruption and lost outputs during concurrent access to shared output lists - Uses single global lock per node for minimal performance impact - Instead of blocking the node evaluation before adding another node evaluation, we move on to the next node, in case another node completes it. ## Changes - Added `threading.Lock` to NodeExecutionProgress class - Protected `add_output()` calls from node executor thread with lock - Protected `pop_output()` calls from graph executor thread with lock - Protected `_pop_done_task()` output checks with lock ## Problem Solved The `NodeExecutionProgress.output` dictionary was being accessed concurrently: - `add_output()` called from node executor thread (asyncio thread) - `pop_output()` called from graph executor thread (main thread) - Python lists are not thread-safe for concurrent append/pop operations - This could cause data corruption, index errors, and lost outputs ## Test Plan - [x] Existing executor tests pass - [x] No performance regression (operations are microsecond-level) - [x] Thread safety verified through code analysis ## Technical Details - Single `threading.Lock()` per NodeExecutionProgress instance (~64 bytes) - Lock acquisition time (~100-200ns) is minimal compared to list operations - Maintains order guarantees for same node_execution_id processing - No GIL contention issues as operations are very brief 🤖 Generated with [Claude Code](https://claude.ai/code) --------- Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -783,11 +783,13 @@ class Executor:
|
||||
|
||||
# node evaluation future -----------------
|
||||
if inflight_eval := running_node_evaluation.get(node_id):
|
||||
try:
|
||||
inflight_eval.result()
|
||||
running_node_evaluation.pop(node_id)
|
||||
except TimeoutError:
|
||||
if not inflight_eval.done():
|
||||
continue
|
||||
try:
|
||||
inflight_eval.result(timeout=0)
|
||||
running_node_evaluation.pop(node_id)
|
||||
except Exception as e:
|
||||
log_metadata.error(f"Node eval #{node_id} failed: {e}")
|
||||
|
||||
# node execution future ---------------------------
|
||||
if inflight_exec.is_done():
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import Future
|
||||
@@ -885,12 +886,14 @@ class NodeExecutionProgress:
|
||||
self.output: dict[str, list[ExecutionOutputEntry]] = defaultdict(list)
|
||||
self.tasks: dict[str, Future] = {}
|
||||
self.on_done_task = on_done_task
|
||||
self._lock = threading.Lock()
|
||||
|
||||
def add_task(self, node_exec_id: str, task: Future):
|
||||
self.tasks[node_exec_id] = task
|
||||
|
||||
def add_output(self, output: ExecutionOutputEntry):
|
||||
self.output[output.node_exec_id].append(output)
|
||||
with self._lock:
|
||||
self.output[output.node_exec_id].append(output)
|
||||
|
||||
def pop_output(self) -> ExecutionOutputEntry | None:
|
||||
exec_id = self._next_exec()
|
||||
@@ -900,8 +903,9 @@ class NodeExecutionProgress:
|
||||
if self._pop_done_task(exec_id):
|
||||
return self.pop_output()
|
||||
|
||||
if next_output := self.output[exec_id]:
|
||||
return next_output.pop(0)
|
||||
with self._lock:
|
||||
if next_output := self.output[exec_id]:
|
||||
return next_output.pop(0)
|
||||
|
||||
return None
|
||||
|
||||
@@ -966,8 +970,9 @@ class NodeExecutionProgress:
|
||||
if not task.done():
|
||||
return False
|
||||
|
||||
if self.output[exec_id]:
|
||||
return False
|
||||
with self._lock:
|
||||
if self.output[exec_id]:
|
||||
return False
|
||||
|
||||
if task := self.tasks.pop(exec_id):
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user