mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-09 14:57:59 -05:00
Add detailed logging to remote runtime resume process (#8819)
Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
@@ -100,7 +100,6 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
self.available_hosts: dict[str, int] = {}
|
||||
|
||||
def log(self, level: str, message: str, exc_info: bool | None = None) -> None:
|
||||
message = f'[runtime session_id={self.sid} runtime_id={self.runtime_id or "unknown"}] {message}'
|
||||
getattr(logger, level)(
|
||||
message,
|
||||
stacklevel=2,
|
||||
@@ -128,14 +127,17 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
self._runtime_initialized = True
|
||||
|
||||
def _start_or_attach_to_runtime(self) -> None:
|
||||
self.log('info', 'Starting or attaching to runtime')
|
||||
existing_runtime = self._check_existing_runtime()
|
||||
if existing_runtime:
|
||||
self.log('debug', f'Using existing runtime with ID: {self.runtime_id}')
|
||||
self.log('info', f'Using existing runtime with ID: {self.runtime_id}')
|
||||
elif self.attach_to_existing:
|
||||
self.log('info', f'Failed to find existing runtime for SID: {self.sid}')
|
||||
raise AgentRuntimeNotFoundError(
|
||||
f'Could not find existing runtime for SID: {self.sid}'
|
||||
)
|
||||
else:
|
||||
self.log('info', 'No existing runtime found, starting a new one')
|
||||
self.send_status_message('STATUS$STARTING_CONTAINER')
|
||||
if self.config.sandbox.runtime_container_image is None:
|
||||
self.log(
|
||||
@@ -165,6 +167,7 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
self.send_status_message(' ')
|
||||
|
||||
def _check_existing_runtime(self) -> bool:
|
||||
self.log('info', f'Checking for existing runtime with session ID: {self.sid}')
|
||||
try:
|
||||
response = self._send_runtime_api_request(
|
||||
'GET',
|
||||
@@ -172,12 +175,16 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
)
|
||||
data = response.json()
|
||||
status = data.get('status')
|
||||
self.log('info', f'Found runtime with status: {status}')
|
||||
if status == 'running' or status == 'paused':
|
||||
self._parse_runtime_response(response)
|
||||
except httpx.HTTPError as e:
|
||||
if e.response.status_code == 404:
|
||||
self.log(
|
||||
'info', f'No existing runtime found for session ID: {self.sid}'
|
||||
)
|
||||
return False
|
||||
self.log('debug', f'Error while looking for remote runtime: {e}')
|
||||
self.log('error', f'Error while looking for remote runtime: {e}')
|
||||
raise
|
||||
except json.decoder.JSONDecodeError as e:
|
||||
self.log(
|
||||
@@ -187,14 +194,25 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
raise
|
||||
|
||||
if status == 'running':
|
||||
self.log('info', 'Found existing runtime in running state')
|
||||
return True
|
||||
elif status == 'stopped':
|
||||
self.log('debug', 'Found existing remote runtime, but it is stopped')
|
||||
self.log('info', 'Found existing runtime, but it is stopped')
|
||||
return False
|
||||
elif status == 'paused':
|
||||
self.log('debug', 'Found existing remote runtime, but it is paused')
|
||||
self._resume_runtime()
|
||||
return True
|
||||
self.log(
|
||||
'info', 'Found existing runtime in paused state, attempting to resume'
|
||||
)
|
||||
try:
|
||||
self._resume_runtime()
|
||||
self.log('info', 'Successfully resumed paused runtime')
|
||||
return True
|
||||
except Exception as e:
|
||||
self.log(
|
||||
'error', f'Failed to resume paused runtime: {e}', exc_info=True
|
||||
)
|
||||
# Return false to indicate we couldn't use the existing runtime
|
||||
return False
|
||||
else:
|
||||
self.log('error', f'Invalid response from runtime API: {data}')
|
||||
return False
|
||||
@@ -287,16 +305,48 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
3. Poll for the runtime to be ready
|
||||
4. Update env vars
|
||||
"""
|
||||
self.log('info', f'Attempting to resume runtime with ID: {self.runtime_id}')
|
||||
self.send_status_message('STATUS$STARTING_RUNTIME')
|
||||
self._send_runtime_api_request(
|
||||
'POST',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/resume',
|
||||
json={'runtime_id': self.runtime_id},
|
||||
try:
|
||||
response = self._send_runtime_api_request(
|
||||
'POST',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/resume',
|
||||
json={'runtime_id': self.runtime_id},
|
||||
)
|
||||
self.log(
|
||||
'info',
|
||||
f'Resume API call successful with status code: {response.status_code}',
|
||||
)
|
||||
except Exception as e:
|
||||
self.log('error', f'Failed to call /resume API: {e}', exc_info=True)
|
||||
raise
|
||||
|
||||
self.log(
|
||||
'info', 'Runtime resume API call completed, waiting for it to be alive...'
|
||||
)
|
||||
self.log('info', 'Runtime resumed, waiting for it to be alive...')
|
||||
self._wait_until_alive()
|
||||
self.setup_initial_env()
|
||||
self.log('info', 'Runtime resumed and alive.')
|
||||
try:
|
||||
self._wait_until_alive()
|
||||
self.log('info', 'Runtime is now alive after resume')
|
||||
except Exception as e:
|
||||
self.log(
|
||||
'error',
|
||||
f'Runtime failed to become alive after resume: {e}',
|
||||
exc_info=True,
|
||||
)
|
||||
raise
|
||||
|
||||
try:
|
||||
self.setup_initial_env()
|
||||
self.log('info', 'Successfully set up initial environment after resume')
|
||||
except Exception as e:
|
||||
self.log(
|
||||
'error',
|
||||
f'Failed to set up initial environment after resume: {e}',
|
||||
exc_info=True,
|
||||
)
|
||||
raise
|
||||
|
||||
self.log('info', 'Runtime successfully resumed and alive.')
|
||||
|
||||
def _parse_runtime_response(self, response: httpx.Response) -> None:
|
||||
start_response = response.json()
|
||||
@@ -491,10 +541,32 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
) from e
|
||||
elif hasattr(e, 'response') and e.response.status_code == 503:
|
||||
if self.config.sandbox.keep_runtime_alive:
|
||||
self.log('warning', 'Runtime appears to be paused. Resuming...')
|
||||
self._resume_runtime()
|
||||
return super()._send_action_server_request(method, url, **kwargs)
|
||||
self.log(
|
||||
'info',
|
||||
f'Runtime appears to be paused (503 response). Runtime ID: {self.runtime_id}, URL: {url}',
|
||||
)
|
||||
try:
|
||||
self._resume_runtime()
|
||||
self.log(
|
||||
'info', 'Successfully resumed runtime after 503 response'
|
||||
)
|
||||
return super()._send_action_server_request(
|
||||
method, url, **kwargs
|
||||
)
|
||||
except Exception as resume_error:
|
||||
self.log(
|
||||
'error',
|
||||
f'Failed to resume runtime after 503 response: {resume_error}',
|
||||
exc_info=True,
|
||||
)
|
||||
raise AgentRuntimeDisconnectedError(
|
||||
f'Runtime is paused and could not be resumed. Original error: {e}, Resume error: {resume_error}'
|
||||
) from resume_error
|
||||
else:
|
||||
self.log(
|
||||
'info',
|
||||
'Runtime appears to be paused (503 response) but keep_runtime_alive is False',
|
||||
)
|
||||
raise AgentRuntimeDisconnectedError(
|
||||
f'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again. Original error: {e}'
|
||||
) from e
|
||||
|
||||
Reference in New Issue
Block a user