feat(runtime): use prlimit to limit resource usage of command to avoid OOM Runtime Kill (#6338)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
Xingyao Wang
2025-02-10 22:21:11 -05:00
committed by GitHub
parent 1a715d2ec4
commit 6a6dc93e03
5 changed files with 151 additions and 43 deletions

View File

@@ -21,6 +21,7 @@ from contextlib import asynccontextmanager
from pathlib import Path
from zipfile import ZipFile
import psutil
from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, StreamingResponse
@@ -108,6 +109,22 @@ class ActionExecutor:
self.last_execution_time = self.start_time
self._initialized = False
# Decide the memory cap, in whole GB, stored on self.max_memory_gb.
if _override_max_memory_gb := os.environ.get('RUNTIME_MAX_MEMORY_GB', None):
    # An explicit override always wins over auto-detection.
    self.max_memory_gb = int(_override_max_memory_gb)
    logger.info(
        f'Setting max memory to {self.max_memory_gb}GB (according to the RUNTIME_MAX_MEMORY_GB environment variable)'
    )
else:
    # Get available system memory
    total_memory_gb = psutil.virtual_memory().total / (
        1024 * 1024 * 1024
    )  # Convert to GB
    # Reserve 1GB as head room. The value is kept as a whole number of GB, so
    # the floor must be an integer too: the previous int(max(0.5, ...))
    # truncated to 0 whenever less than 2GB was installed, i.e. a zero-byte
    # limit instead of the intended minimum.
    self.max_memory_gb = max(1, int(total_memory_gb - 1.0))
    logger.info(
        f'Total memory: {total_memory_gb}GB, setting limit to {self.max_memory_gb}GB (reserved 1GB for action execution server, minimum 1GB)'
    )
@property
def initial_cwd(self):
    """Read-only accessor for the value stored in ``self._initial_cwd``."""
    cwd = self._initial_cwd
    return cwd
@@ -120,8 +137,10 @@ class ActionExecutor:
no_change_timeout_seconds=int(
os.environ.get('NO_CHANGE_TIMEOUT_SECONDS', 30)
),
max_memory_mb=self.max_memory_gb * 1024,
)
self.bash_session.initialize()
await wait_all(
(self._init_plugin(plugin) for plugin in self.plugins_to_load),
timeout=30,

View File

@@ -212,13 +212,17 @@ class RemoteRuntime(ActionExecutionClient):
plugins=self.plugins,
app_config=self.config,
)
# Environment variables sent with the start request.
# DEBUG is only included when debugging is enabled (config flag or DEBUG env
# var). The previous dict literal applied the conditional to the *value*, so
# with debugging off it produced {'DEBUG': {}} instead of leaving the key out.
environment = {}
if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true':
    environment['DEBUG'] = 'true'
# Entries from the sandbox config are applied last and therefore override DEBUG.
environment.update(self.config.sandbox.runtime_startup_env_vars)
start_request = {
'image': self.container_image,
'command': command,
'working_dir': '/openhands/code/',
'environment': {'DEBUG': 'true'}
if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true'
else {},
'environment': environment,
'session_id': self.sid,
'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
}

View File

@@ -175,25 +175,32 @@ class BashSession:
work_dir: str,
username: str | None = None,
no_change_timeout_seconds: int = 30,
max_memory_mb: int | None = None,
):
self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds
self.work_dir = work_dir
self.username = username
self._initialized = False
self.max_memory_mb = max_memory_mb
def initialize(self):
self.server = libtmux.Server()
window_command = '/bin/bash'
_shell_command = '/bin/bash'
if self.username in ['root', 'openhands']:
# This starts a non-login (new) shell for the given user
window_command = f'su {self.username} -'
_shell_command = f'su {self.username} -'
# otherwise, we are running as the CURRENT USER (e.g., when running LocalRuntime)
if self.max_memory_mb is not None:
window_command = (
f'prlimit --as={self.max_memory_mb * 1024 * 1024} {_shell_command}'
)
else:
window_command = _shell_command
logger.debug(f'Initializing bash session with command: {window_command}')
session_name = f'openhands-{self.username}-{uuid.uuid4()}'
self.session = self.server.new_session(
session_name=session_name,
window_name='bash',
window_command=window_command,
start_directory=self.work_dir,
kill_session=True,
x=1000,
@@ -207,6 +214,7 @@ class BashSession:
# We need to create a new pane because the initial pane's history limit is (default) 2000
_initial_window = self.session.attached_window
self.window = self.session.new_window(
window_name='bash',
window_shell=window_command,
start_directory=self.work_dir,
)

View File

@@ -0,0 +1,113 @@
"""Stress tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
from conftest import _close_test_runtime, _load_runtime
from openhands.core.logger import openhands_logger as logger
from openhands.events.action import CmdRunAction
def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1):
    """Install stress-ng in a resource-capped runtime and run all its stressors.

    Args:
        temp_dir: forwarded to ``_load_runtime``.
        runtime_cls: forwarded to ``_load_runtime``.
        repeat: how many times the stress-ng run is executed.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
        },
    )
    # Close the runtime even if an action or assertion raises, so a failing
    # run does not leave it open.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        for _ in range(repeat):
            # run stress-ng stress tests for 30 seconds
            action = CmdRunAction(command='stress-ng --all 1 -t 30s')
            action.set_hard_timeout(120)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    finally:
        _close_test_runtime(runtime)
def test_stress_docker_runtime_hit_memory_limits(temp_dir, runtime_cls):
    """Check that an over-sized allocation is refused instead of succeeding.

    The runtime is started with ``RUNTIME_MAX_MEMORY_GB=3`` and stress-ng is
    asked for a 6G VM stressor; the test expects stress-ng to report that it
    ran out of system resources and to exit with code 3.

    Args:
        temp_dir: forwarded to ``_load_runtime``.
        runtime_cls: forwarded to ``_load_runtime``.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
            'memswap_limit': '0',  # No swap
            'mem_swappiness': 0,  # Disable swapping
            'oom_kill_disable': False,  # Enable OOM killer
        },
        runtime_startup_env_vars={
            'RUNTIME_MAX_MEMORY_GB': '3',
        },
    )
    # The assertions below are expected to be able to fail; make sure the
    # runtime is closed in that case as well.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        action = CmdRunAction(
            command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics'
        )
        action.set_hard_timeout(120)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert 'aborted early, out of system resources' in obs.content
        assert obs.exit_code == 3  # OOM killed!
    finally:
        _close_test_runtime(runtime)
def test_stress_docker_runtime_within_memory_limits(temp_dir, runtime_cls):
    """Check that the same stress-ng run succeeds when the limit is high enough.

    The runtime is started with ``RUNTIME_MAX_MEMORY_GB=7`` and stress-ng is
    asked for a 6G VM stressor; the command is expected to exit with code 0.

    Args:
        temp_dir: forwarded to ``_load_runtime``.
        runtime_cls: forwarded to ``_load_runtime``.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
            'memswap_limit': '0',  # No swap
            'mem_swappiness': 0,  # Disable swapping
            'oom_kill_disable': False,  # Enable OOM killer
        },
        runtime_startup_env_vars={
            'RUNTIME_MAX_MEMORY_GB': '7',
        },
    )
    # Close the runtime even when an assertion fails.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        action = CmdRunAction(
            command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics'
        )
        action.set_hard_timeout(120)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0
    finally:
        _close_test_runtime(runtime)

View File

@@ -1,36 +0,0 @@
"""Stress tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
from conftest import _close_test_runtime, _load_runtime
from openhands.core.logger import openhands_logger as logger
from openhands.events.action import CmdRunAction
def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1):
    """Install stress-ng in a resource-capped runtime and run all its stressors.

    Args:
        temp_dir: forwarded to ``_load_runtime``.
        runtime_cls: forwarded to ``_load_runtime``.
        repeat: how many times the one-minute stress-ng run is executed.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
        },
    )
    # Close the runtime even if an action or assertion raises.
    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        for _ in range(repeat):
            # run stress-ng stress tests for 1 minute
            action = CmdRunAction(command='stress-ng --all 1 -t 1m')
            action.set_hard_timeout(120)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    finally:
        _close_test_runtime(runtime)