Add detailed logging to remote runtime resume process (#8819)

Co-authored-by: openhands <openhands@all-hands.dev>
2026-01-09 14:57:59 -05:00 · 2025-06-02 13:12:24 -04:00
parent 72c24b461c
commit 21af52560a
1 changed file with 90 additions and 18 deletions
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -100,7 +100,6 @@ class RemoteRuntime(ActionExecutionClient):
        self.available_hosts: dict[str, int] = {}

    def log(self, level: str, message: str, exc_info: bool | None = None) -> None:
-        message = f'[runtime session_id={self.sid} runtime_id={self.runtime_id or "unknown"}] {message}'
        getattr(logger, level)(
            message,
            stacklevel=2,
@@ -128,14 +127,17 @@ class RemoteRuntime(ActionExecutionClient):
        self._runtime_initialized = True

    def _start_or_attach_to_runtime(self) -> None:
+        self.log('info', 'Starting or attaching to runtime')
        existing_runtime = self._check_existing_runtime()
        if existing_runtime:
-            self.log('debug', f'Using existing runtime with ID: {self.runtime_id}')
+            self.log('info', f'Using existing runtime with ID: {self.runtime_id}')
        elif self.attach_to_existing:
+            self.log('info', f'Failed to find existing runtime for SID: {self.sid}')
            raise AgentRuntimeNotFoundError(
                f'Could not find existing runtime for SID: {self.sid}'
            )
        else:
+            self.log('info', 'No existing runtime found, starting a new one')
            self.send_status_message('STATUS$STARTING_CONTAINER')
            if self.config.sandbox.runtime_container_image is None:
                self.log(
@@ -165,6 +167,7 @@ class RemoteRuntime(ActionExecutionClient):
        self.send_status_message(' ')

    def _check_existing_runtime(self) -> bool:
+        self.log('info', f'Checking for existing runtime with session ID: {self.sid}')
        try:
            response = self._send_runtime_api_request(
                'GET',
@@ -172,12 +175,16 @@ class RemoteRuntime(ActionExecutionClient):
            )
            data = response.json()
            status = data.get('status')
+            self.log('info', f'Found runtime with status: {status}')
            if status == 'running' or status == 'paused':
                self._parse_runtime_response(response)
        except httpx.HTTPError as e:
            if e.response.status_code == 404:
+                self.log(
+                    'info', f'No existing runtime found for session ID: {self.sid}'
+                )
                return False
-            self.log('debug', f'Error while looking for remote runtime: {e}')
+            self.log('error', f'Error while looking for remote runtime: {e}')
            raise
        except json.decoder.JSONDecodeError as e:
            self.log(
@@ -187,14 +194,25 @@ class RemoteRuntime(ActionExecutionClient):
            raise

        if status == 'running':
+            self.log('info', 'Found existing runtime in running state')
            return True
        elif status == 'stopped':
-            self.log('debug', 'Found existing remote runtime, but it is stopped')
+            self.log('info', 'Found existing runtime, but it is stopped')
            return False
        elif status == 'paused':
-            self.log('debug', 'Found existing remote runtime, but it is paused')
-            self._resume_runtime()
-            return True
+            self.log(
+                'info', 'Found existing runtime in paused state, attempting to resume'
+            )
+            try:
+                self._resume_runtime()
+                self.log('info', 'Successfully resumed paused runtime')
+                return True
+            except Exception as e:
+                self.log(
+                    'error', f'Failed to resume paused runtime: {e}', exc_info=True
+                )
+                # Return false to indicate we couldn't use the existing runtime
+                return False
        else:
            self.log('error', f'Invalid response from runtime API: {data}')
            return False
@@ -287,16 +305,48 @@ class RemoteRuntime(ActionExecutionClient):
        3. Poll for the runtime to be ready
        4. Update env vars
        """
+        self.log('info', f'Attempting to resume runtime with ID: {self.runtime_id}')
        self.send_status_message('STATUS$STARTING_RUNTIME')
-        self._send_runtime_api_request(
-            'POST',
-            f'{self.config.sandbox.remote_runtime_api_url}/resume',
-            json={'runtime_id': self.runtime_id},
+        try:
+            response = self._send_runtime_api_request(
+                'POST',
+                f'{self.config.sandbox.remote_runtime_api_url}/resume',
+                json={'runtime_id': self.runtime_id},
+            )
+            self.log(
+                'info',
+                f'Resume API call successful with status code: {response.status_code}',
+            )
+        except Exception as e:
+            self.log('error', f'Failed to call /resume API: {e}', exc_info=True)
+            raise
+
+        self.log(
+            'info', 'Runtime resume API call completed, waiting for it to be alive...'
        )
-        self.log('info', 'Runtime resumed, waiting for it to be alive...')
-        self._wait_until_alive()
-        self.setup_initial_env()
-        self.log('info', 'Runtime resumed and alive.')
+        try:
+            self._wait_until_alive()
+            self.log('info', 'Runtime is now alive after resume')
+        except Exception as e:
+            self.log(
+                'error',
+                f'Runtime failed to become alive after resume: {e}',
+                exc_info=True,
+            )
+            raise
+
+        try:
+            self.setup_initial_env()
+            self.log('info', 'Successfully set up initial environment after resume')
+        except Exception as e:
+            self.log(
+                'error',
+                f'Failed to set up initial environment after resume: {e}',
+                exc_info=True,
+            )
+            raise
+
+        self.log('info', 'Runtime successfully resumed and alive.')

    def _parse_runtime_response(self, response: httpx.Response) -> None:
        start_response = response.json()
@@ -491,10 +541,32 @@ class RemoteRuntime(ActionExecutionClient):
                    ) from e
            elif hasattr(e, 'response') and e.response.status_code == 503:
                if self.config.sandbox.keep_runtime_alive:
-                    self.log('warning', 'Runtime appears to be paused. Resuming...')
-                    self._resume_runtime()
-                    return super()._send_action_server_request(method, url, **kwargs)
+                    self.log(
+                        'info',
+                        f'Runtime appears to be paused (503 response). Runtime ID: {self.runtime_id}, URL: {url}',
+                    )
+                    try:
+                        self._resume_runtime()
+                        self.log(
+                            'info', 'Successfully resumed runtime after 503 response'
+                        )
+                        return super()._send_action_server_request(
+                            method, url, **kwargs
+                        )
+                    except Exception as resume_error:
+                        self.log(
+                            'error',
+                            f'Failed to resume runtime after 503 response: {resume_error}',
+                            exc_info=True,
+                        )
+                        raise AgentRuntimeDisconnectedError(
+                            f'Runtime is paused and could not be resumed. Original error: {e}, Resume error: {resume_error}'
+                        ) from resume_error
                else:
+                    self.log(
+                        'info',
+                        'Runtime appears to be paused (503 response) but keep_runtime_alive is False',
+                    )
                    raise AgentRuntimeDisconnectedError(
                        f'Runtime is temporarily unavailable. This may be due to a restart or network issue, please try again. Original error: {e}'
                    ) from e