From f9fa1d95cbf879e8722dcde88c5bc68eabbfe384 Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Thu, 7 Nov 2024 10:22:47 -0600
Subject: [PATCH] fix(RemoteRuntime): add retry for pod status after /start (#4825)

---
 openhands/runtime/impl/remote/remote_runtime.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py
index 1e6fdecf51..e74d4305be 100644
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -90,9 +90,8 @@ class RemoteRuntime(Runtime):
         self.runtime_url: str | None = None
 
     async def connect(self):
-        await call_sync_from_async(self._start_or_attach_to_runtime)
         try:
-            await call_sync_from_async(self._wait_until_alive)
+            await call_sync_from_async(self._start_or_attach_to_runtime)
         except RuntimeNotReadyError:
             self.log('error', 'Runtime failed to start, timed out before ready')
             raise
@@ -277,6 +276,14 @@ class RemoteRuntime(Runtime):
         assert runtime_data['runtime_id'] == self.runtime_id
         assert 'pod_status' in runtime_data
         pod_status = runtime_data['pod_status']
+
+        # FIXME: We should fix it at the backend of /start endpoint, make sure
+        # the pod is created before returning the response.
+        # Retry a period of time to give the cluster time to start the pod
+        if pod_status == 'Not Found':
+            raise RuntimeNotReadyError(
+                f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
+            )
         if pod_status == 'Ready':
             try:
                 self._send_request(
@@ -291,7 +298,7 @@ class RemoteRuntime(Runtime):
                     f'Runtime /alive failed to respond with 200: {e}'
                 )
             return
-        if pod_status in ('Failed', 'Unknown', 'Not Found'):
+        if pod_status in ('Failed', 'Unknown'):
             # clean up the runtime
             self.close()
             raise RuntimeError(