mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-10 07:18:10 -05:00
feat(runtime): use prlimit to limit resource usage of command to avoid OOM Runtime Kill (#6338)
Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
@@ -21,6 +21,7 @@ from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from zipfile import ZipFile
|
||||
|
||||
import psutil
|
||||
from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
@@ -108,6 +109,22 @@ class ActionExecutor:
|
||||
self.last_execution_time = self.start_time
|
||||
self._initialized = False
|
||||
|
||||
if _override_max_memory_gb := os.environ.get('RUNTIME_MAX_MEMORY_GB', None):
|
||||
self.max_memory_gb = int(_override_max_memory_gb)
|
||||
logger.info(
|
||||
f'Setting max memory to {self.max_memory_gb}GB (according to the RUNTIME_MAX_MEMORY_GB environment variable)'
|
||||
)
|
||||
else:
|
||||
# Get available system memory
|
||||
total_memory_gb = psutil.virtual_memory().total / (
|
||||
1024 * 1024 * 1024
|
||||
) # Convert to GB
|
||||
self.max_memory_gb = int(max(0.5, total_memory_gb - 1.0))
|
||||
# Reserve 1GB as head room, minimum of 0.5GB
|
||||
logger.info(
|
||||
f'Total memory: {total_memory_gb}GB, setting limit to {self.max_memory_gb}GB (reserved 1GB for action execution server, minimum 0.5GB)'
|
||||
)
|
||||
|
||||
@property
|
||||
def initial_cwd(self):
|
||||
return self._initial_cwd
|
||||
@@ -120,8 +137,10 @@ class ActionExecutor:
|
||||
no_change_timeout_seconds=int(
|
||||
os.environ.get('NO_CHANGE_TIMEOUT_SECONDS', 30)
|
||||
),
|
||||
max_memory_mb=self.max_memory_gb * 1024,
|
||||
)
|
||||
self.bash_session.initialize()
|
||||
|
||||
await wait_all(
|
||||
(self._init_plugin(plugin) for plugin in self.plugins_to_load),
|
||||
timeout=30,
|
||||
|
||||
@@ -212,13 +212,17 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
plugins=self.plugins,
|
||||
app_config=self.config,
|
||||
)
|
||||
environment = {
|
||||
'DEBUG': 'true'
|
||||
if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true'
|
||||
else {},
|
||||
}
|
||||
environment.update(self.config.sandbox.runtime_startup_env_vars)
|
||||
start_request = {
|
||||
'image': self.container_image,
|
||||
'command': command,
|
||||
'working_dir': '/openhands/code/',
|
||||
'environment': {'DEBUG': 'true'}
|
||||
if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true'
|
||||
else {},
|
||||
'environment': environment,
|
||||
'session_id': self.sid,
|
||||
'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
|
||||
}
|
||||
|
||||
@@ -175,25 +175,32 @@ class BashSession:
|
||||
work_dir: str,
|
||||
username: str | None = None,
|
||||
no_change_timeout_seconds: int = 30,
|
||||
max_memory_mb: int | None = None,
|
||||
):
|
||||
self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds
|
||||
self.work_dir = work_dir
|
||||
self.username = username
|
||||
self._initialized = False
|
||||
self.max_memory_mb = max_memory_mb
|
||||
|
||||
def initialize(self):
|
||||
self.server = libtmux.Server()
|
||||
window_command = '/bin/bash'
|
||||
_shell_command = '/bin/bash'
|
||||
if self.username in ['root', 'openhands']:
|
||||
# This starts a non-login (new) shell for the given user
|
||||
window_command = f'su {self.username} -'
|
||||
_shell_command = f'su {self.username} -'
|
||||
# otherwise, we are running as the CURRENT USER (e.g., when running LocalRuntime)
|
||||
if self.max_memory_mb is not None:
|
||||
window_command = (
|
||||
f'prlimit --as={self.max_memory_mb * 1024 * 1024} {_shell_command}'
|
||||
)
|
||||
else:
|
||||
window_command = _shell_command
|
||||
|
||||
logger.debug(f'Initializing bash session with command: {window_command}')
|
||||
session_name = f'openhands-{self.username}-{uuid.uuid4()}'
|
||||
self.session = self.server.new_session(
|
||||
session_name=session_name,
|
||||
window_name='bash',
|
||||
window_command=window_command,
|
||||
start_directory=self.work_dir,
|
||||
kill_session=True,
|
||||
x=1000,
|
||||
@@ -207,6 +214,7 @@ class BashSession:
|
||||
# We need to create a new pane because the initial pane's history limit is (default) 2000
|
||||
_initial_window = self.session.attached_window
|
||||
self.window = self.session.new_window(
|
||||
window_name='bash',
|
||||
window_shell=window_command,
|
||||
start_directory=self.work_dir,
|
||||
)
|
||||
|
||||
113
tests/runtime/test_runtime_resource.py
Normal file
113
tests/runtime/test_runtime_resource.py
Normal file
@@ -0,0 +1,113 @@
|
||||
"""Stress tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
|
||||
|
||||
from conftest import _close_test_runtime, _load_runtime
|
||||
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.events.action import CmdRunAction
|
||||
|
||||
|
||||
def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1):
    """Run a general stress-ng workload inside a resource-capped runtime.

    The runtime is started with one CPU's worth of quota and a 4G memory
    limit. stress-ng is installed (this step must succeed), then
    ``stress-ng --all 1 -t 30s`` is executed ``repeat`` times. The outcome of
    the stress command itself is only logged, not asserted.

    Args:
        temp_dir: Temporary directory fixture handed to ``_load_runtime``.
        runtime_cls: Runtime class fixture handed to ``_load_runtime``.
        repeat: Number of times the stress command is run.
    """
    runtime, config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
        },
    )

    # Always tear the runtime down, even when an assertion below fails;
    # otherwise a failed test leaves the runtime behind.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        for _ in range(repeat):
            # Run the stress-ng stress tests for 30 seconds; the hard timeout
            # of 120 seconds leaves headroom for start-up and shutdown.
            action = CmdRunAction(command='stress-ng --all 1 -t 30s')
            action.set_hard_timeout(120)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    finally:
        _close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_stress_docker_runtime_hit_memory_limits(temp_dir, runtime_cls):
    """Check that a command exceeding the configured memory cap fails cleanly.

    The runtime is started with a 4G memory limit and swap disabled, and
    ``RUNTIME_MAX_MEMORY_GB`` is set to ``3`` through the runtime start-up
    environment. A stress-ng VM stressor is then asked to use 6G. The test
    expects the command to report ``aborted early, out of system resources``
    and exit with code 3, while the runtime stays able to return that
    observation.

    Args:
        temp_dir: Temporary directory fixture handed to ``_load_runtime``.
        runtime_cls: Runtime class fixture handed to ``_load_runtime``.
    """
    runtime, config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
            'memswap_limit': '0',  # No swap
            'mem_swappiness': 0,  # Disable swapping
            'oom_kill_disable': False,  # Enable OOM killer
        },
        runtime_startup_env_vars={
            'RUNTIME_MAX_MEMORY_GB': '3',
        },
    )

    # Always tear the runtime down, even when an assertion below fails;
    # otherwise a failed test leaves the runtime behind.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        # Request more memory (6G) than the 3G cap configured above.
        action = CmdRunAction(
            command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics'
        )
        action.set_hard_timeout(120)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert 'aborted early, out of system resources' in obs.content
        # stress-ng itself gives up for lack of resources (exit code 3); the
        # command fails but an observation is still returned.
        assert obs.exit_code == 3
    finally:
        _close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_stress_docker_runtime_within_memory_limits(temp_dir, runtime_cls):
    """Check that a command below the configured memory cap runs to completion.

    The runtime is started with a 4G memory limit and swap disabled, and
    ``RUNTIME_MAX_MEMORY_GB`` is set to ``7`` through the runtime start-up
    environment. A stress-ng VM stressor is then asked to use 6G, which is
    below that 7G setting, and the command is expected to exit with code 0.

    Args:
        temp_dir: Temporary directory fixture handed to ``_load_runtime``.
        runtime_cls: Runtime class fixture handed to ``_load_runtime``.
    """
    runtime, config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
            'memswap_limit': '0',  # No swap
            'mem_swappiness': 0,  # Disable swapping
            'oom_kill_disable': False,  # Enable OOM killer
        },
        runtime_startup_env_vars={
            'RUNTIME_MAX_MEMORY_GB': '7',
        },
    )

    # Always tear the runtime down, even when an assertion below fails;
    # otherwise a failed test leaves the runtime behind.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        # Request 6G, which stays under the 7G cap configured above.
        action = CmdRunAction(
            command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics'
        )
        action.set_hard_timeout(120)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0
    finally:
        _close_test_runtime(runtime)
|
||||
@@ -1,36 +0,0 @@
|
||||
"""Stress tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
|
||||
|
||||
from conftest import _close_test_runtime, _load_runtime
|
||||
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.events.action import CmdRunAction
|
||||
|
||||
|
||||
def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1):
    """Run a general stress-ng workload inside a resource-capped runtime.

    The runtime is started with one CPU's worth of quota and a 4G memory
    limit. stress-ng is installed (this step must succeed), then
    ``stress-ng --all 1 -t 1m`` is executed ``repeat`` times. The outcome of
    the stress command itself is only logged, not asserted.

    Args:
        temp_dir: Temporary directory fixture handed to ``_load_runtime``.
        runtime_cls: Runtime class fixture handed to ``_load_runtime``.
        repeat: Number of times the stress command is run.
    """
    runtime, config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
        },
    )

    # Always tear the runtime down, even when an assertion below fails;
    # otherwise a failed test leaves the runtime behind.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        for _ in range(repeat):
            # run stress-ng stress tests for 1 minute
            action = CmdRunAction(command='stress-ng --all 1 -t 1m')
            action.set_hard_timeout(120)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    finally:
        _close_test_runtime(runtime)
|
||||
Reference in New Issue
Block a user