mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-02-18 18:44:42 -05:00
Uncouple Copilot task execution from the REST API server. This should improve performance and scalability, and allows task execution to continue regardless of the state of the user's connection.

- Resolves #12023

### Changes 🏗️

- Add `backend.copilot.executor` -> `CoPilotExecutor` (set up similarly to `backend.executor` -> `ExecutionManager`). This executor service uses RabbitMQ-based task distribution and sticks with the existing Redis Streams setup for task output. It uses a cluster lock mechanism to ensure a task is only executed by one pod, and the `DatabaseManager` for pooled DB access.
- Add `backend.data.db_accessors` for automatic choice of direct/proxied DB access.

Chat requests now flow: API → RabbitMQ → CoPilot Executor → Redis Streams → SSE Client (see the sketch below this description). This enables horizontal scaling of chat processing and isolates long-running LLM operations from the API service.

- Move non-API Copilot code into `backend.copilot` (from `backend.api.features.chat`)
  - Update import paths for all usages
- Move `backend.executor.database` to `backend.data.db_manager` and add methods for the copilot executor
  - Update import paths for all usages
- Make `backend.copilot.db` RPC-compatible (DB ops return ~~Prisma~~ Pydantic models)
- Make `backend.data.workspace` RPC-compatible
- Make `backend.data.graphs.get_store_listed_graphs` RPC-compatible

DX:
- Add `copilot_executor` service to Docker setup

Config:
- Add `Config.num_copilot_workers` (default 5) and `Config.copilot_executor_port` (default 8008)
- Remove unused `Config.agent_server_port`

> [!WARNING]
> **This change adds a new microservice to the system, with entrypoint `backend.copilot.executor`.**
> The `docker compose` setup has been updated, but if you run the Platform on something else, you'll have to update your deployment config to include this new service.
>
> When running locally, the `CoPilotExecutor` uses port 8008 by default.

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Copilot works
    - [x] Processes messages when triggered
    - [x] Can use its tools

#### For configuration changes:
- [x] `.env.default` is updated or already compatible with my changes
- [x] `docker-compose.yml` is updated or already compatible with my changes
- [x] I have included a list of my configuration changes in the PR description (under **Changes**)

---------

Co-authored-by: Zamil Majdy <zamil.majdy@agpt.co>
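For orientation, here is a minimal, hedged sketch of the API → RabbitMQ → CoPilot Executor → Redis Streams flow described above. It is not the actual `CoPilotExecutor` implementation: the queue and stream names, connection details, and the `run_chat_task()` stub are assumptions made purely for illustration.

```python
# Hypothetical sketch only: queue/stream names and run_chat_task() are illustrative,
# not taken from the AutoGPT codebase.
import json

import pika
import redis

r = redis.Redis()


def publish_chat_task(task: dict) -> None:
    """API side: enqueue a chat task instead of running it in-process."""
    conn = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
    ch = conn.channel()
    ch.queue_declare(queue="copilot_tasks", durable=True)
    ch.basic_publish(exchange="", routing_key="copilot_tasks", body=json.dumps(task))
    conn.close()


def run_chat_task(task: dict):
    """Stand-in for the long-running LLM / tool-calling work."""
    yield f"echo: {task.get('prompt', '')}"


def worker() -> None:
    """Executor side: consume tasks and stream output chunks to a Redis Stream,
    where the API's SSE endpoint can pick them up for the client."""
    conn = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
    ch = conn.channel()
    ch.queue_declare(queue="copilot_tasks", durable=True)

    def handle(channel, method, properties, body):
        task = json.loads(body)
        for chunk in run_chat_task(task):
            r.xadd(f"chat_output:{task['task_id']}", {"chunk": chunk})
        channel.basic_ack(delivery_tag=method.delivery_tag)

    ch.basic_consume(queue="copilot_tasks", on_message_callback=handle)
    ch.start_consuming()
```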
129 lines
4.2 KiB
Python
"""Redis-based distributed locking for cluster coordination."""
|
|
|
|
import logging
|
|
import threading
|
|
import time
|
|
from typing import TYPE_CHECKING
|
|
|
|
if TYPE_CHECKING:
|
|
from redis import Redis
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ClusterLock:
|
|
"""Simple Redis-based distributed lock for preventing duplicate execution."""
|
|
|
|
def __init__(self, redis: "Redis", key: str, owner_id: str, timeout: int = 300):
|
|
self.redis = redis
|
|
self.key = key
|
|
self.owner_id = owner_id
|
|
self.timeout = timeout
|
|
self._last_refresh = 0.0
|
|
self._refresh_lock = threading.Lock()
|
|
|
|
def try_acquire(self) -> str | None:
|
|
"""Try to acquire the lock.
|
|
|
|
Returns:
|
|
- owner_id (self.owner_id) if successfully acquired
|
|
- different owner_id if someone else holds the lock
|
|
- None if Redis is unavailable or other error
|
|
"""
|
|
try:
|
|
success = self.redis.set(self.key, self.owner_id, nx=True, ex=self.timeout)
|
|
if success:
|
|
with self._refresh_lock:
|
|
self._last_refresh = time.time()
|
|
return self.owner_id # Successfully acquired
|
|
|
|
# Failed to acquire, get current owner
|
|
current_value = self.redis.get(self.key)
|
|
if current_value:
|
|
current_owner = (
|
|
current_value.decode("utf-8")
|
|
if isinstance(current_value, bytes)
|
|
else str(current_value)
|
|
)
|
|
return current_owner
|
|
|
|
# Key doesn't exist but we failed to set it - race condition or Redis issue
|
|
return None
|
|
|
|
except Exception as e:
|
|
logger.error(f"ClusterLock.try_acquire failed for key {self.key}: {e}")
|
|
return None
|
|
|
|
def refresh(self) -> bool:
|
|
"""Refresh lock TTL if we still own it.
|
|
|
|
Rate limited to at most once every timeout/10 seconds (minimum 1 second).
|
|
During rate limiting, still verifies lock existence but skips TTL extension.
|
|
Setting _last_refresh to 0 bypasses rate limiting for testing.
|
|
|
|
Thread-safe: uses _refresh_lock to protect _last_refresh access.
|
|
"""
|
|
# Calculate refresh interval: max(timeout // 10, 1)
|
|
refresh_interval = max(self.timeout // 10, 1)
|
|
current_time = time.time()
|
|
|
|
# Check if we're within the rate limit period (thread-safe read)
|
|
# _last_refresh == 0 forces a refresh (bypasses rate limiting for testing)
|
|
with self._refresh_lock:
|
|
last_refresh = self._last_refresh
|
|
is_rate_limited = (
|
|
last_refresh > 0 and (current_time - last_refresh) < refresh_interval
|
|
)
|
|
|
|
try:
|
|
# Always verify lock existence, even during rate limiting
|
|
current_value = self.redis.get(self.key)
|
|
if not current_value:
|
|
with self._refresh_lock:
|
|
self._last_refresh = 0
|
|
return False
|
|
|
|
stored_owner = (
|
|
current_value.decode("utf-8")
|
|
if isinstance(current_value, bytes)
|
|
else str(current_value)
|
|
)
|
|
if stored_owner != self.owner_id:
|
|
with self._refresh_lock:
|
|
self._last_refresh = 0
|
|
return False
|
|
|
|
# If rate limited, return True but don't update TTL or timestamp
|
|
if is_rate_limited:
|
|
return True
|
|
|
|
# Perform actual refresh
|
|
if self.redis.expire(self.key, self.timeout):
|
|
with self._refresh_lock:
|
|
self._last_refresh = current_time
|
|
return True
|
|
|
|
with self._refresh_lock:
|
|
self._last_refresh = 0
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.error(f"ClusterLock.refresh failed for key {self.key}: {e}")
|
|
with self._refresh_lock:
|
|
self._last_refresh = 0
|
|
return False
|
|
|
|
def release(self):
|
|
"""Release the lock."""
|
|
with self._refresh_lock:
|
|
if self._last_refresh == 0:
|
|
return
|
|
|
|
try:
|
|
self.redis.delete(self.key)
|
|
except Exception:
|
|
pass
|
|
|
|
with self._refresh_lock:
|
|
self._last_refresh = 0.0
|
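For reference, a worker could wrap a task in the `ClusterLock` above roughly as in the sketch below. The Redis connection details, lock key, and the work loop are assumptions for illustration, not part of this file.

```python
# Hypothetical usage sketch for ClusterLock; the key name and work loop are illustrative.
import time
import uuid

from redis import Redis

redis_client = Redis(host="localhost", port=6379)
owner = str(uuid.uuid4())  # unique identity for this pod/worker
lock = ClusterLock(redis_client, key="copilot:task:1234", owner_id=owner, timeout=300)

if lock.try_acquire() == owner:
    try:
        for _ in range(10):            # stand-in for the real task loop
            time.sleep(1)              # ... do a slice of work ...
            if not lock.refresh():     # lock expired or was taken over: stop working
                break
    finally:
        lock.release()
else:
    print("Another worker already holds this task; skipping.")
```

Calling `refresh()` inside the loop keeps the TTL alive while the task runs; if it ever returns `False`, the worker should assume another pod now owns the task and stop.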