fix: Properly propagate AgentRuntimeTimeoutError to evaluation loop

2026-04-29 03:00:45 -04:00 · 2025-07-26 17:01:34 +00:00
2 changed files with 24 additions and 2 deletions
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -641,7 +641,9 @@ def process_instance(
            )
        )

-        # if fatal error, throw EvalError to trigger re-run
+        # If state is None or has a fatal error, raise EvalException to trigger a re-run
+        if state is None:
+            raise EvalException('State is None, likely due to a runtime error')
        if is_fatal_evaluation_error(state.last_error):
            raise EvalException('Fatal error detected: ' + state.last_error)

@@ -671,8 +673,9 @@ def process_instance(

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+    # This check is redundant with the one above, but it is kept as a safety net
    if state is None:
-        raise ValueError('State should not be None.')
+        raise EvalException('State is None, likely due to a runtime error')

    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
    histories = [event_to_dict(event) for event in state.history]
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -15,6 +15,12 @@ from openhands.core.config import (
    setup_config_from_args,
 )
 from openhands.core.config.mcp_config import OpenHandsMCPConfigImpl
+from openhands.core.exceptions import (
+    AgentRuntimeDisconnectedError,
+    AgentRuntimeNotFoundError,
+    AgentRuntimeTimeoutError,
+    AgentRuntimeUnavailableError,
+)
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.loop import run_agent_until_done
 from openhands.core.schema import AgentState
@@ -207,6 +213,19 @@ async def run_controller(
        await run_agent_until_done(controller, runtime, memory, end_states)
    except Exception as e:
        logger.error(f'Exception in main loop: {e}')
+        # Set the error in the state so it can be detected by is_fatal_evaluation_error
+        controller.state.last_error = f'{type(e).__name__}: {str(e)}'
+        # If it's a fatal runtime error, we should re-raise it so the evaluation loop can handle it
+        if isinstance(
+            e,
+            (
+                AgentRuntimeTimeoutError,
+                AgentRuntimeDisconnectedError,
+                AgentRuntimeUnavailableError,
+                AgentRuntimeNotFoundError,
+            ),
+        ):
+            raise e

    # save session when we're about to close
    if config.file_store is not None and config.file_store != 'memory':