Add detailed logging to remote runtime resume process (#8819)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Robert Brennan
2025-06-02 13:12:24 -04:00
committed by GitHub
parent 72c24b461c
commit 21af52560a

View File

@@ -100,7 +100,6 @@ class RemoteRuntime(ActionExecutionClient):
self.available_hosts: dict[str, int] = {}
def log(self, level: str, message: str, exc_info: bool | None = None) -> None:
message = f'[runtime session_id={self.sid} runtime_id={self.runtime_id or "unknown"}] {message}'
getattr(logger, level)(
message,
stacklevel=2,
@@ -128,14 +127,17 @@ class RemoteRuntime(ActionExecutionClient):
self._runtime_initialized = True
def _start_or_attach_to_runtime(self) -> None:
self.log('info', 'Starting or attaching to runtime')
existing_runtime = self._check_existing_runtime()
if existing_runtime:
self.log('debug', f'Using existing runtime with ID: {self.runtime_id}')
self.log('info', f'Using existing runtime with ID: {self.runtime_id}')
elif self.attach_to_existing:
self.log('info', f'Failed to find existing runtime for SID: {self.sid}')
raise AgentRuntimeNotFoundError(
f'Could not find existing runtime for SID: {self.sid}'
)
else:
self.log('info', 'No existing runtime found, starting a new one')
self.send_status_message('STATUS$STARTING_CONTAINER')
if self.config.sandbox.runtime_container_image is None:
self.log(
@@ -165,6 +167,7 @@ class RemoteRuntime(ActionExecutionClient):
self.send_status_message(' ')
def _check_existing_runtime(self) -> bool:
self.log('info', f'Checking for existing runtime with session ID: {self.sid}')
try:
response = self._send_runtime_api_request(
'GET',
@@ -172,12 +175,16 @@ class RemoteRuntime(ActionExecutionClient):
)
data = response.json()
status = data.get('status')
self.log('info', f'Found runtime with status: {status}')
if status == 'running' or status == 'paused':
self._parse_runtime_response(response)
except httpx.HTTPError as e:
if e.response.status_code == 404:
self.log(
'info', f'No existing runtime found for session ID: {self.sid}'
)
return False
self.log('debug', f'Error while looking for remote runtime: {e}')
self.log('error', f'Error while looking for remote runtime: {e}')
raise
except json.decoder.JSONDecodeError as e:
self.log(
@@ -187,14 +194,25 @@ class RemoteRuntime(ActionExecutionClient):
raise
if status == 'running':
self.log('info', 'Found existing runtime in running state')
return True
elif status == 'stopped':
self.log('debug', 'Found existing remote runtime, but it is stopped')
self.log('info', 'Found existing runtime, but it is stopped')
return False
elif status == 'paused':
self.log('debug', 'Found existing remote runtime, but it is paused')
self._resume_runtime()
return True
self.log(
'info', 'Found existing runtime in paused state, attempting to resume'
)
try:
self._resume_runtime()
self.log('info', 'Successfully resumed paused runtime')
return True
except Exception as e:
self.log(
'error', f'Failed to resume paused runtime: {e}', exc_info=True
)
# Return false to indicate we couldn't use the existing runtime
return False
else:
self.log('error', f'Invalid response from runtime API: {data}')
return False
@@ -287,16 +305,48 @@ class RemoteRuntime(ActionExecutionClient):
3. Poll for the runtime to be ready
4. Update env vars
"""
self.log('info', f'Attempting to resume runtime with ID: {self.runtime_id}')
self.send_status_message('STATUS$STARTING_RUNTIME')
self._send_runtime_api_request(
'POST',
f'{self.config.sandbox.remote_runtime_api_url}/resume',
json={'runtime_id': self.runtime_id},
try:
response = self._send_runtime_api_request(
'POST',
f'{self.config.sandbox.remote_runtime_api_url}/resume',
json={'runtime_id': self.runtime_id},
)
self.log(
'info',
f'Resume API call successful with status code: {response.status_code}',
)
except Exception as e:
self.log('error', f'Failed to call /resume API: {e}', exc_info=True)
raise
self.log(
'info', 'Runtime resume API call completed, waiting for it to be alive...'
)
self.log('info', 'Runtime resumed, waiting for it to be alive...')
self._wait_until_alive()
self.setup_initial_env()
self.log('info', 'Runtime resumed and alive.')
try:
self._wait_until_alive()
self.log('info', 'Runtime is now alive after resume')
except Exception as e:
self.log(
'error',
f'Runtime failed to become alive after resume: {e}',
exc_info=True,
)
raise
try:
self.setup_initial_env()
self.log('info', 'Successfully set up initial environment after resume')
except Exception as e:
self.log(
'error',
f'Failed to set up initial environment after resume: {e}',
exc_info=True,
)
raise
self.log('info', 'Runtime successfully resumed and alive.')
def _parse_runtime_response(self, response: httpx.Response) -> None:
start_response = response.json()
@@ -491,10 +541,32 @@ class RemoteRuntime(ActionExecutionClient):
) from e
elif hasattr(e, 'response') and e.response.status_code == 503:
if self.config.sandbox.keep_runtime_alive:
self.log('warning', 'Runtime appears to be paused. Resuming...')
self._resume_runtime()
return super()._send_action_server_request(method, url, **kwargs)
self.log(
'info',
f'Runtime appears to be paused (503 response). Runtime ID: {self.runtime_id}, URL: {url}',
)
try:
self._resume_runtime()
self.log(
'info', 'Successfully resumed runtime after 503 response'
)
return super()._send_action_server_request(
method, url, **kwargs
)
except Exception as resume_error:
self.log(
'error',
f'Failed to resume runtime after 503 response: {resume_error}',
exc_info=True,
)
raise AgentRuntimeDisconnectedError(
f'Runtime is paused and could not be resumed. Original error: {e}, Resume error: {resume_error}'
) from resume_error
else:
self.log(
'info',
'Runtime appears to be paused (503 response) but keep_runtime_alive is False',
)
raise AgentRuntimeDisconnectedError(
f'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again. Original error: {e}'
) from e