mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
feat(backend/copilot): enable E2B auto_resume and reduce safety-net timeout (#12397)
Enable E2B `auto_resume` lifecycle option and reduce the safety-net timeout from 3 hours to 5 minutes.

Currently, if the explicit per-turn `pause_sandbox_direct()` call fails (process crash, network issue, fire-and-forget task cancellation), the sandbox keeps running for up to **3 hours** before the safety-net timeout fires. With this change, worst-case billing drops to **5 minutes**.

### Changes

- Add `auto_resume: True` to sandbox lifecycle config — paused sandboxes wake transparently on SDK activity
- Reduce `e2b_sandbox_timeout` default from 10800s (3h) → 300s (5min)
- Add `e2b_sandbox_auto_resume` config field (default: `True`)
- Guard: `auto_resume` only enabled when `on_timeout == "pause"`

### What doesn't change

- Explicit per-turn `pause_sandbox_direct()` remains the primary mechanism
- `connect()` / `_try_reconnect()` flow unchanged
- Redis key management unchanged
- No latency impact (resume is ~1-2s regardless of trigger)

### Risk

Very low — `auto_resume` is additive. If it doesn't work as advertised, `connect()` still resumes paused sandboxes exactly as before.

Ref: https://e2b.dev/docs/sandbox/auto-resume
Linear: SECRT-2118

---

Co-authored-by: Zamil Majdy (@majdyz) <zamil.majdy@agpt.co>
This commit is contained in:
@@ -115,7 +115,7 @@ class ChatConfig(BaseSettings):
|
||||
description="E2B sandbox template to use for copilot sessions.",
|
||||
)
|
||||
e2b_sandbox_timeout: int = Field(
|
||||
default=10800, # 3 hours — wall-clock timeout, not idle; explicit pause is primary
|
||||
default=300, # 5 min safety net — explicit per-turn pause is the primary mechanism
|
||||
description="E2B sandbox running-time timeout (seconds). "
|
||||
"E2B timeout is wall-clock (not idle). Explicit per-turn pause is the primary "
|
||||
"mechanism; this is the safety net.",
|
||||
|
||||
@@ -21,9 +21,11 @@ Lifecycle
|
||||
Cost control
|
||||
------------
|
||||
Sandboxes are created with a configurable ``on_timeout`` lifecycle action
|
||||
(default: ``"pause"``). The explicit per-turn ``pause_sandbox()`` call is the
|
||||
primary mechanism; the lifecycle setting is a safety net. Paused sandboxes are
|
||||
free.
|
||||
(default: ``"pause"``) and ``auto_resume`` (default: ``True``). The explicit
|
||||
per-turn ``pause_sandbox()`` call is the primary mechanism; the lifecycle
|
||||
timeout is a safety net (default: 5 min). ``auto_resume`` ensures that paused
|
||||
sandboxes wake transparently on SDK activity, making the aggressive safety-net
|
||||
timeout safe. Paused sandboxes are free.
|
||||
|
||||
The sandbox_id is stored in Redis. The same key doubles as a creation lock:
|
||||
a ``"creating"`` sentinel value is written with a short TTL while a new sandbox
|
||||
@@ -40,6 +42,7 @@ import logging
|
||||
from typing import Any, Awaitable, Callable, Literal
|
||||
|
||||
from e2b import AsyncSandbox
|
||||
from e2b.sandbox.sandbox_api import SandboxLifecycle
|
||||
|
||||
from backend.data.redis_client import get_redis_async
|
||||
|
||||
@@ -116,9 +119,10 @@ async def get_or_create_sandbox(
|
||||
removes the need for a separate lock key.
|
||||
|
||||
*timeout* controls how long the e2b sandbox may run continuously before
|
||||
the ``on_timeout`` lifecycle rule fires (default: 3 h).
|
||||
the ``on_timeout`` lifecycle rule fires (default: 5 min).
|
||||
*on_timeout* controls what happens on timeout: ``"pause"`` (default, free)
|
||||
or ``"kill"``.
|
||||
or ``"kill"``. When ``"pause"``, ``auto_resume`` is enabled so paused
|
||||
sandboxes wake transparently on SDK activity.
|
||||
"""
|
||||
redis = await get_redis_async()
|
||||
key = _sandbox_key(session_id)
|
||||
@@ -156,11 +160,15 @@ async def get_or_create_sandbox(
|
||||
|
||||
# We hold the slot — create the sandbox.
|
||||
try:
|
||||
lifecycle = SandboxLifecycle(
|
||||
on_timeout=on_timeout,
|
||||
auto_resume=on_timeout == "pause",
|
||||
)
|
||||
sandbox = await AsyncSandbox.create(
|
||||
template=template,
|
||||
api_key=api_key,
|
||||
timeout=timeout,
|
||||
lifecycle={"on_timeout": on_timeout},
|
||||
lifecycle=lifecycle,
|
||||
)
|
||||
try:
|
||||
await _set_stored_sandbox_id(session_id, sandbox.sandbox_id)
|
||||
|
||||
@@ -157,14 +157,17 @@ class TestGetOrCreateSandbox:
|
||||
|
||||
assert result is new_sb
|
||||
mock_cls.create.assert_awaited_once()
|
||||
# Verify lifecycle param is set
|
||||
# Verify lifecycle: pause + auto_resume enabled
|
||||
_, kwargs = mock_cls.create.call_args
|
||||
assert kwargs.get("lifecycle") == {"on_timeout": "pause"}
|
||||
assert kwargs.get("lifecycle") == {
|
||||
"on_timeout": "pause",
|
||||
"auto_resume": True,
|
||||
}
|
||||
# sandbox_id should be saved to Redis
|
||||
redis.set.assert_awaited()
|
||||
|
||||
def test_create_with_on_timeout_kill(self):
|
||||
"""on_timeout='kill' is passed through to AsyncSandbox.create."""
|
||||
"""on_timeout='kill' disables auto_resume automatically."""
|
||||
new_sb = _mock_sandbox("sb-new")
|
||||
redis = _mock_redis(set_nx_result=True, stored_sandbox_id=None)
|
||||
with (
|
||||
@@ -179,7 +182,10 @@ class TestGetOrCreateSandbox:
|
||||
)
|
||||
|
||||
_, kwargs = mock_cls.create.call_args
|
||||
assert kwargs.get("lifecycle") == {"on_timeout": "kill"}
|
||||
assert kwargs.get("lifecycle") == {
|
||||
"on_timeout": "kill",
|
||||
"auto_resume": False,
|
||||
}
|
||||
|
||||
def test_create_failure_releases_slot(self):
|
||||
"""If sandbox creation fails, the Redis creation slot is deleted."""
|
||||
|
||||
8
autogpt_platform/backend/poetry.lock
generated
8
autogpt_platform/backend/poetry.lock
generated
@@ -1282,14 +1282,14 @@ pgp = ["gpg"]
|
||||
|
||||
[[package]]
|
||||
name = "e2b"
|
||||
version = "2.15.1"
|
||||
version = "2.15.2"
|
||||
description = "E2B SDK that give agents cloud environments"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "e2b-2.15.1-py3-none-any.whl", hash = "sha256:a3bc4e004eab51fb05bae44e9ee4fe821e4637260f4ce3064c8f7c6ed7f5a2a0"},
|
||||
{file = "e2b-2.15.1.tar.gz", hash = "sha256:a4f1bbc8b5180a8a1098079257fcb73e42503ed546098f676f722f11f0d68c09"},
|
||||
{file = "e2b-2.15.2-py3-none-any.whl", hash = "sha256:19a56fbdea25974dc81426ed48337eae6cea91d404f5bcf8861a5a2c6e4d982a"},
|
||||
{file = "e2b-2.15.2.tar.gz", hash = "sha256:414379d2421d6827eeb2eb50a4d6b3fdb7d691b39ff73b5ea05ca4b532819831"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -8882,4 +8882,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.10,<3.14"
|
||||
content-hash = "618d61b0586ab82fec1e28d1feb549a198e0b5c9d152e808862e55efc00a65b9"
|
||||
content-hash = "4e4365721cd3b68c58c237353b74adae1c64233fd4446904c335f23eb866fdca"
|
||||
|
||||
@@ -20,7 +20,7 @@ claude-agent-sdk = "0.1.45" # see copilot/sdk/sdk_compat_test.py for capability
|
||||
click = "^8.2.0"
|
||||
cryptography = "^46.0"
|
||||
discord-py = "^2.5.2"
|
||||
e2b = "^2.0"
|
||||
e2b = "^2.15.2"
|
||||
e2b-code-interpreter = "^2.0"
|
||||
elevenlabs = "^1.50.0"
|
||||
fastapi = "^0.128.6"
|
||||
|
||||
Reference in New Issue
Block a user