feat(backend): Add retry on executor process initialization (#9865)

Executor process initialization can fail and cause this error:
```
concurrent.futures.process.BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore
```

### Changes 🏗️

Add a retry (the shared `func_retry` decorator) to the executor start hooks to reduce the chance of the initialization error occurring.

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Existing tests
This commit is contained in:
Zamil Majdy
2025-04-23 23:24:17 +02:00
committed by GitHub
parent 160a622ba4
commit 1e3236a041
3 changed files with 12 additions and 6 deletions

View File

@@ -20,7 +20,6 @@ from prisma.types import (
CreditTransactionCreateInput,
CreditTransactionWhereInput,
)
from tenacity import retry, stop_after_attempt, wait_exponential
from backend.data import db
from backend.data.block_cost_config import BLOCK_COSTS
@@ -36,6 +35,7 @@ from backend.data.user import get_user_by_id
from backend.executor.utils import UsageTransactionMetadata
from backend.notifications import NotificationManager
from backend.util.exceptions import InsufficientBalanceError
from backend.util.retry import func_retry
from backend.util.service import get_service_client
from backend.util.settings import Settings
@@ -262,11 +262,7 @@ class UserCreditBase(ABC):
)
return transaction_balance, transaction_time
@retry(
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=1, max=10),
reraise=True,
)
@func_retry
async def _enable_transaction(
self,
transaction_key: str,

View File

@@ -61,6 +61,7 @@ from backend.util.decorator import error_logged, time_measured
from backend.util.file import clean_exec_files
from backend.util.logging import configure_logging
from backend.util.process import AppProcess, set_service_name
from backend.util.retry import func_retry
from backend.util.service import close_service_client, get_service_client
from backend.util.settings import Settings
@@ -422,6 +423,7 @@ class Executor:
"""
@classmethod
@func_retry
def on_node_executor_start(cls):
configure_logging()
set_service_name("NodeExecutor")
@@ -527,6 +529,7 @@ class Executor:
stats.error = e
@classmethod
@func_retry
def on_graph_executor_start(cls):
configure_logging()
set_service_name("GraphExecutor")

View File

@@ -73,3 +73,10 @@ def conn_retry(
return async_wrapper if is_coroutine else sync_wrapper
return decorator
# Shared retry decorator for plain functions: gives up after 5 attempts and
# waits between attempts using exponential backoff bounded by min=1 / max=30.
# NOTE(review): reraise=False presumably means that, once attempts are
# exhausted, tenacity raises its own RetryError instead of the last underlying
# exception -- confirm that callers which previously relied on reraise=True
# do not catch the original exception type.
func_retry = retry(
reraise=False,
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=1, max=30),
)