Compare commits

...

33 Commits

Author SHA1 Message Date
Xingyao Wang
fd1414f7d6 Merge branch 'main' into fix-runtime-error-handling 2025-06-12 17:27:40 -04:00
openhands
9d4b0cc29b Merge runtime error handling tests into a single file 2025-06-03 19:13:07 +00:00
Xingyao Wang
c3e272cdf5 Merge branch 'main' into fix-runtime-error-handling 2025-06-03 15:06:19 -04:00
openhands
4c9c501ad0 Add simplified tests for resetting _last_updated_mcp_stdio_servers on runtime errors 2025-06-03 18:44:57 +00:00
openhands
1277b5a67c Add comprehensive tests for resetting _last_updated_mcp_stdio_servers on runtime errors 2025-06-03 18:42:28 +00:00
openhands
22e29885a1 Add tests for resetting _last_updated_mcp_stdio_servers on runtime errors 2025-06-03 18:39:34 +00:00
openhands
b0a53e6ab5 Reset _last_updated_mcp_stdio_servers to empty list when error happens 2025-06-03 18:15:17 +00:00
openhands
716c1ec5b7 Refactor runtime error handling to use tenacity instead of while loop 2025-06-03 15:20:37 +00:00
openhands
7ea2763fa2 Simplify runtime error handling in base.py and remove agent controller changes 2025-06-03 14:39:59 +00:00
openhands
ca2d9dece1 Move runtime error handling logic from agent_controller to runtime/base.py 2025-06-03 14:24:13 +00:00
Xingyao Wang
e450e126f9 Merge branch 'main' into fix-runtime-error-handling 2025-06-03 10:11:22 -04:00
Xingyao Wang
3c79f06dfa Merge branch 'main' into fix-runtime-error-handling 2025-05-23 23:54:22 +08:00
openhands
79816cf582 Merge main into fix-runtime-error-handling 2025-05-19 02:02:56 +00:00
openhands
93cce89313 Fix runtime error handling and linter issues 2025-04-29 14:28:05 +00:00
openhands
6cb7066900 Merge main into fix-runtime-error-handling 2025-04-29 02:13:25 +00:00
openhands
531603c391 Fix linting issues in test_agent_controller.py 2025-03-19 21:26:51 +00:00
openhands
20d51944a2 Merge main into fix-runtime-error-handling and resolve merge conflicts 2025-03-19 21:20:01 +00:00
openhands
044cd4fbab Fix docstring linting issues 2025-03-18 17:14:34 +00:00
openhands
3aa9f40fd3 Fix linting issues in agent_controller.py 2025-03-18 17:12:42 +00:00
openhands
cd12e465cd Merge main into fix-runtime-error-handling, resolving conflicts 2025-03-18 17:02:58 +00:00
Xingyao Wang
1c0d800041 Merge branch 'main' into fix-runtime-error-handling 2025-03-17 21:05:13 -04:00
openhands
d113abbd8b Fix linting issues with docstrings and formatting 2025-03-17 18:33:40 +00:00
openhands
bd05a4b2e1 Fix tests to match updated error message 2025-03-17 17:15:14 +00:00
openhands
20cc2538e9 Remove redundant test_runtime_error_handling.py file 2025-03-17 16:57:40 +00:00
Xingyao Wang
dbfc471490 Update openhands/controller/agent_controller.py 2025-03-18 00:56:31 +08:00
openhands
922341c3f1 Simplify test by removing counter reset test 2025-03-17 16:55:18 +00:00
openhands
129989dd09 Fix test for counter reset functionality 2025-03-17 16:54:40 +00:00
openhands
0a39bb83b1 Fix test assertions for event source 2025-03-17 16:53:51 +00:00
openhands
dcf9e9f559 Fix EventSource.SYSTEM to EventSource.ENVIRONMENT 2025-03-17 16:52:57 +00:00
openhands
8246d6bcb8 Add missing imports for runtime error exceptions 2025-03-17 16:51:54 +00:00
openhands
301ddeb4e9 Add proper unit tests for runtime error handling in agent controller 2025-03-17 16:51:01 +00:00
openhands
649acd3d9c Remove _try_migrate_workspace method as requested 2025-03-17 16:48:26 +00:00
openhands
4f33f0e35f Fix #6032: Better handling of Critical Runtime Error
- Add retry mechanism for runtime errors with a maximum of 3 retries
- Add workspace migration functionality to recover from failed runtimes
- Add informative error messages to the agent with retry count information
- Reset retry counter on successful steps
- Add comprehensive tests for the implementation
2025-03-17 16:45:59 +00:00
3 changed files with 543 additions and 12 deletions

View File

@@ -58,6 +58,7 @@ class SandboxConfig(BaseModel):
remote_runtime_init_timeout: int = Field(default=180)
remote_runtime_api_timeout: int = Field(default=10)
remote_runtime_enable_retries: bool = Field(default=True)
retry_on_unrecoverable_runtime_error: bool = Field(default=False)
remote_runtime_class: str | None = Field(
default=None
) # can be "None" (default to gvisor) or "sysbox" (support docker inside runtime + more stable)

View File

@@ -14,10 +14,15 @@ from typing import Callable, cast
from zipfile import ZipFile
import httpx
import tenacity
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from openhands.core.config import OpenHandsConfig, SandboxConfig
from openhands.core.config.mcp_config import MCPConfig, MCPStdioServerConfig
from openhands.core.exceptions import AgentRuntimeDisconnectedError
from openhands.core.exceptions import (
AgentRuntimeDisconnectedError,
AgentRuntimeUnavailableError,
)
from openhands.core.logger import openhands_logger as logger
from openhands.events import EventSource, EventStream, EventStreamSubscriber
from openhands.events.action import (
@@ -333,22 +338,134 @@ class Runtime(FileEditRuntimeMixin):
f'Failed export latest github token to runtime: {self.sid}, {e}'
)
async def _handle_runtime_error(
    self,
    event: Action,
    error: Exception,
    retry_count: int,
    max_retries: int = 3,
    retry_delay: int = 10,
) -> None:
    """Record a runtime failure and surface it to the agent as an observation.

    The method logs the failure, clears the tracked MCP stdio server list
    (when the attribute exists), and publishes an ``ErrorObservation`` on the
    event stream. It neither sleeps nor re-executes the action.

    Args:
        event: The action that was running when the error was raised. Its
            ``id`` and ``tool_call_metadata`` are copied onto the observation.
        error: The exception that was raised.
        retry_count: Attempt number shown in the message and the warning log.
        max_retries: Retry ceiling shown in the message and the warning log.
        retry_delay: Accepted for signature compatibility; this method does
            not read it.

    Returns:
        None
    """
    failure_summary = f'{type(error).__name__}: {error!s}'
    self.log('error', f'Runtime error while running action: {failure_summary}')
    self.log('error', f'Problematic action: {event!s}')

    # Reset MCP stdio server tracking; skipped when the attribute is absent.
    if hasattr(self, '_last_updated_mcp_stdio_servers'):
        from openhands.core.config.mcp_config import MCPStdioServerConfig

        no_servers: list[MCPStdioServerConfig] = []
        self._last_updated_mcp_stdio_servers = no_servers
        self.log(
            'debug',
            'Reset _last_updated_mcp_stdio_servers to empty list due to runtime error',
        )

    # Text shown to the agent; the pieces are joined with single spaces.
    retry_note = f'(Retry {retry_count} of {max_retries})'
    notice = ' '.join(
        (
            'Your command may have consumed too much resources, and the previous runtime died.',
            'You are connected to a new runtime container, all dependencies you have installed',
            f'outside /workspace may not be persisted. {retry_note}',
        )
    )

    # Link the observation back to the failing action before publishing it.
    notice_event = ErrorObservation(content=notice)
    notice_event._cause = event.id  # type: ignore[attr-defined]
    notice_event.tool_call_metadata = event.tool_call_metadata
    self.event_stream.add_event(notice_event, EventSource.ENVIRONMENT)  # type: ignore[arg-type]

    self.log(
        'warning',
        f'Runtime error occurred. Retry {retry_count} of {max_retries}.',
    )
async def _execute_action_core(self, event: Action) -> Observation:
    """Run a single action and return the observation it produces.

    Git provider tokens are exported first. ``MCPAction`` instances are then
    routed to ``call_tool_mcp``; every other action is run through
    ``run_action`` via ``call_sync_from_async``.

    Args:
        event: The action to execute.

    Returns:
        The observation resulting from the action.
    """
    await self._export_latest_git_provider_tokens(event)

    if not isinstance(event, MCPAction):
        return await call_sync_from_async(self.run_action, event)

    result: Observation = await self.call_tool_mcp(event)
    return result
async def _handle_action(self, event: Action) -> None:
if event.timeout is None:
# We don't block the command if this is a default timeout action
event.set_hard_timeout(self.config.sandbox.timeout, blocking=False)
assert event.timeout is not None
# Define a before_sleep callback for tenacity
async def before_sleep_callback(retry_state: tenacity.RetryCallState) -> None:
exception = retry_state.outcome.exception()
if exception:
await self._handle_runtime_error(
event,
exception,
retry_state.attempt_number,
max_retries=3,
retry_delay=10,
)
# Create a retry decorator based on configuration
if self.config.sandbox.retry_on_unrecoverable_runtime_error:
retry_decorator = retry(
retry=retry_if_exception_type(
(AgentRuntimeDisconnectedError, AgentRuntimeUnavailableError)
),
stop=stop_after_attempt(3),
wait=wait_fixed(10),
before_sleep=before_sleep_callback,
reraise=True,
)
execute_with_retry = retry_decorator(self._execute_action_core)
else:
# No retry if not enabled in config
execute_with_retry = self._execute_action_core
try:
await self._export_latest_git_provider_tokens(event)
if isinstance(event, MCPAction):
observation: Observation = await self.call_tool_mcp(event)
else:
observation = await call_sync_from_async(self.run_action, event)
# Execute the action with retry if configured
observation: Observation = await execute_with_retry(event)
# Set observation metadata
observation._cause = event.id # type: ignore[attr-defined]
observation.tool_call_metadata = event.tool_call_metadata
except (AgentRuntimeDisconnectedError, AgentRuntimeUnavailableError) as e:
# This will only be reached if retries are disabled or all retries failed
err_id = 'STATUS$ERROR_RUNTIME_DISCONNECTED'
error_message = f'{type(e).__name__}: {str(e)}'
self.log('error', f'Runtime error while running action: {error_message}')
self.log('error', f'Problematic action: {str(event)}')
self.send_error_message(err_id, error_message)
return
except Exception as e:
# Handle other exceptions
err_id = ''
if isinstance(e, httpx.NetworkError) or isinstance(
e, AgentRuntimeDisconnectedError
):
if isinstance(e, httpx.NetworkError):
err_id = 'STATUS$ERROR_RUNTIME_DISCONNECTED'
error_message = f'{type(e).__name__}: {str(e)}'
self.log('error', f'Unexpected error while running action: {error_message}')
@@ -356,9 +473,6 @@ class Runtime(FileEditRuntimeMixin):
self.send_error_message(err_id, error_message)
return
observation._cause = event.id # type: ignore[attr-defined]
observation.tool_call_metadata = event.tool_call_metadata
# this might be unnecessary, since source should be set by the event stream when we're here
source = event.source if event.source else EventSource.AGENT
if isinstance(observation, NullObservation):

View File

@@ -0,0 +1,416 @@
from unittest.mock import AsyncMock, Mock, patch
import httpx
import pytest
from openhands.core.config.mcp_config import MCPStdioServerConfig
from openhands.core.exceptions import (
AgentRuntimeDisconnectedError,
AgentRuntimeTimeoutError,
)
from openhands.events.action import CmdRunAction, MCPAction
from openhands.events.event import EventSource
from openhands.events.observation import ErrorObservation, Observation
from openhands.runtime.base import Runtime
class TestRuntimeErrorHandling:
    """Tests for runtime error handling functionality."""

    # Debug line Runtime._handle_runtime_error logs after clearing MCP tracking.
    _RESET_LOG_MESSAGE = (
        'Reset _last_updated_mcp_stdio_servers to empty list due to runtime error'
    )
    # Fixed part of the ErrorObservation content sent to the agent.
    _STANDARD_ERROR_TEXT = 'Your command may have consumed too much resources'
    # Value the patched ``CmdRunAction.id`` property returns in the helper.
    _FAKE_ACTION_ID = 12345

    @pytest.fixture
    def mock_runtime(self):
        """Create a mock Runtime with necessary attributes."""
        runtime = Mock(spec=Runtime)
        runtime._last_updated_mcp_stdio_servers = [
            MCPStdioServerConfig(name='test-server-1', command='test-command-1'),
            MCPStdioServerConfig(name='test-server-2', command='test-command-2'),
        ]
        runtime.log = Mock()
        runtime.event_stream = Mock()
        # _handle_runtime_error calls add_event synchronously, so a plain Mock
        # is used; an AsyncMock would create coroutines that are never awaited.
        runtime.event_stream.add_event = Mock()
        runtime.send_error_message = Mock()
        runtime.config = Mock()
        runtime.config.sandbox = Mock()
        return runtime

    @staticmethod
    def _make_action():
        """Return a CmdRunAction with an explicit hard timeout already set."""
        action = CmdRunAction(command='test command')
        action.set_hard_timeout(120)
        return action

    async def _check_handle_runtime_error(
        self, runtime, error, retry_count, max_retries=3
    ):
        """Call ``_handle_runtime_error`` and assert its observable effects.

        Shared by the ``test_handle_runtime_error_*`` tests, which differ only
        in the exception type and the retry counter they pass in.
        """
        # Function-scope import keeps this class self-contained.
        from unittest.mock import PropertyMock

        action = self._make_action()

        # PropertyMock makes attribute access ``action.id`` return the integer.
        # A plain Mock installed on the class would be returned as-is instead.
        # NOTE(review): assumes ``id`` is a property on the action class, as the
        # original test comment states - confirm.
        with patch(
            'openhands.events.action.commands.CmdRunAction.id',
            new_callable=PropertyMock,
            return_value=self._FAKE_ACTION_ID,
        ):
            await Runtime._handle_runtime_error(
                runtime,
                action,
                error,
                retry_count=retry_count,
                max_retries=max_retries,
            )

        # The MCP tracking list must be cleared and the reset logged.
        assert runtime._last_updated_mcp_stdio_servers == []
        runtime.log.assert_any_call('debug', self._RESET_LOG_MESSAGE)

        # Exactly one observation is published, attributed to the environment.
        runtime.event_stream.add_event.assert_called_once()
        observation, source = runtime.event_stream.add_event.call_args[0]
        assert isinstance(observation, ErrorObservation)
        assert source == EventSource.ENVIRONMENT

        # The observation is linked to the failing action.
        assert observation._cause == self._FAKE_ACTION_ID

        # The message carries the standard text and the retry counter.
        assert self._STANDARD_ERROR_TEXT in observation.content
        assert f'Retry {retry_count} of {max_retries}' in observation.content

    @pytest.mark.asyncio
    async def test_handle_runtime_error_resets_mcp_servers(self, mock_runtime):
        """Test that _handle_runtime_error resets _last_updated_mcp_stdio_servers."""
        await self._check_handle_runtime_error(
            mock_runtime,
            AgentRuntimeTimeoutError('Runtime timeout'),
            retry_count=1,
        )

    @pytest.mark.asyncio
    async def test_handle_runtime_error_on_disconnected(self, mock_runtime):
        """Test that _handle_runtime_error handles disconnected errors correctly."""
        await self._check_handle_runtime_error(
            mock_runtime,
            AgentRuntimeDisconnectedError('Runtime disconnected'),
            retry_count=2,
        )

    @pytest.mark.asyncio
    async def test_handle_runtime_error_on_http_error(self, mock_runtime):
        """Test that _handle_runtime_error handles HTTP errors correctly."""
        # Create a mock response with a 502 status code
        mock_response = Mock()
        mock_response.status_code = 502
        await self._check_handle_runtime_error(
            mock_runtime,
            httpx.HTTPStatusError(
                'Bad Gateway', request=Mock(), response=mock_response
            ),
            retry_count=1,
        )

    @pytest.mark.asyncio
    async def test_handle_runtime_error_on_max_retries(self, mock_runtime):
        """Test that _handle_runtime_error handles max retries correctly."""
        await self._check_handle_runtime_error(
            mock_runtime,
            Exception('Generic error'),
            retry_count=3,  # Same as max_retries
        )

    @pytest.mark.asyncio
    async def test_execute_action_core(self, mock_runtime):
        """Test the _execute_action_core method."""
        action = CmdRunAction(command='test command')

        mock_observation = Mock(spec=Observation)
        mock_runtime.run_action = Mock(return_value=mock_observation)

        # Replace the sync-to-async bridge so no real execution happens.
        with patch(
            'openhands.runtime.base.call_sync_from_async',
            return_value=mock_observation,
        ):
            result = await Runtime._execute_action_core(mock_runtime, action)

        assert result == mock_observation
        mock_runtime._export_latest_git_provider_tokens.assert_called_once_with(
            action
        )

    @pytest.mark.asyncio
    async def test_execute_action_core_with_mcp_action(self, mock_runtime):
        """Test the _execute_action_core method with an MCP action."""
        action = Mock(spec=MCPAction)

        mock_observation = Mock(spec=Observation)
        mock_runtime.call_tool_mcp = AsyncMock(return_value=mock_observation)

        result = await Runtime._execute_action_core(mock_runtime, action)

        assert result == mock_observation
        mock_runtime._export_latest_git_provider_tokens.assert_called_once_with(action)
        mock_runtime.call_tool_mcp.assert_called_once_with(action)

    @pytest.mark.asyncio
    async def test_handle_action_with_retry_enabled(self, mock_runtime):
        """Test _handle_action with retry enabled."""
        mock_runtime.config.sandbox.retry_on_unrecoverable_runtime_error = True
        action = self._make_action()

        mock_observation = Mock(spec=Observation)
        mock_runtime._execute_action_core = AsyncMock(return_value=mock_observation)

        # The action succeeds on the first attempt, so the retry wrapper calls
        # the core once and never sleeps.
        await Runtime._handle_action(mock_runtime, action)

        mock_runtime._execute_action_core.assert_called_once_with(action)

        # Compare values rather than using hasattr(): the metadata must
        # actually be copied from the action.
        assert mock_observation._cause == action.id
        assert mock_observation.tool_call_metadata == action.tool_call_metadata

    @pytest.mark.asyncio
    async def test_handle_action_with_retry_disabled(self, mock_runtime):
        """Test _handle_action with retry disabled."""
        mock_runtime.config.sandbox.retry_on_unrecoverable_runtime_error = False
        action = self._make_action()

        mock_observation = Mock(spec=Observation)
        mock_runtime._execute_action_core = AsyncMock(return_value=mock_observation)

        await Runtime._handle_action(mock_runtime, action)

        mock_runtime._execute_action_core.assert_called_once_with(action)
        assert mock_observation._cause == action.id
        assert mock_observation.tool_call_metadata == action.tool_call_metadata

    @pytest.mark.asyncio
    async def test_handle_action_with_runtime_error(self, mock_runtime):
        """Test _handle_action when a runtime error occurs."""
        mock_runtime.config.sandbox.retry_on_unrecoverable_runtime_error = False
        action = self._make_action()

        error = AgentRuntimeDisconnectedError('Runtime disconnected')
        mock_runtime._execute_action_core = AsyncMock(side_effect=error)

        await Runtime._handle_action(mock_runtime, action)

        mock_runtime._execute_action_core.assert_called_once_with(action)
        mock_runtime.send_error_message.assert_called_once_with(
            'STATUS$ERROR_RUNTIME_DISCONNECTED',
            'AgentRuntimeDisconnectedError: Runtime disconnected',
        )

    @pytest.mark.asyncio
    async def test_handle_action_with_other_exception(self, mock_runtime):
        """Test _handle_action when a non-runtime error occurs."""
        mock_runtime.config.sandbox.retry_on_unrecoverable_runtime_error = False
        action = self._make_action()

        error = ValueError('Invalid value')
        mock_runtime._execute_action_core = AsyncMock(side_effect=error)

        await Runtime._handle_action(mock_runtime, action)

        mock_runtime._execute_action_core.assert_called_once_with(action)
        # Non-runtime errors are reported with an empty status id.
        mock_runtime.send_error_message.assert_called_once_with(
            '', 'ValueError: Invalid value'
        )

    @pytest.mark.asyncio
    async def test_handle_action_with_network_error(self, mock_runtime):
        """Test _handle_action when a network error occurs."""
        mock_runtime.config.sandbox.retry_on_unrecoverable_runtime_error = False
        action = self._make_action()

        error = httpx.NetworkError('Connection error')
        mock_runtime._execute_action_core = AsyncMock(side_effect=error)

        await Runtime._handle_action(mock_runtime, action)

        mock_runtime._execute_action_core.assert_called_once_with(action)
        mock_runtime.send_error_message.assert_called_once_with(
            'STATUS$ERROR_RUNTIME_DISCONNECTED', 'NetworkError: Connection error'
        )