feat(eval): reliability improvement for SWE-Bench eval_infer (#6347)

2026-01-10 07:18:10 -05:00 · 2025-01-18 14:02:59 -05:00
parent 4383be1ab4
commit 2b04ee2e62
2 changed files with 15 additions and 10 deletions
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -355,7 +355,9 @@ def _process_instance_wrapper(
            )
            # e is likely an EvalException, so we can't directly infer it from type
            # but rather check if it's a fatal error
-            if is_fatal_runtime_error(str(e)):
+            # But it can also be AgentRuntime**Error (e.g., swe_bench/eval_infer.py)
+            _error_str = type(e).__name__ + ': ' + str(e)
+            if is_fatal_runtime_error(_error_str):
                runtime_failure_count += 1
                msg += f'Runtime disconnected error detected for instance {instance.instance_id}, runtime failure count: {runtime_failure_count}'
                msg += '\n' + '-' * 10 + '\n'
@@ -531,6 +533,7 @@ def is_fatal_runtime_error(error: str | None) -> bool:
        return False

    FATAL_RUNTIME_ERRORS = [
+        AgentRuntimeTimeoutError,
        AgentRuntimeUnavailableError,
        AgentRuntimeDisconnectedError,
        AgentRuntimeNotFoundError,