diff --git a/evaluation/benchmarks/swe_bench/resource/mapping.py b/evaluation/benchmarks/swe_bench/resource/mapping.py
index be6ddad621..d29d9a11c1 100644
--- a/evaluation/benchmarks/swe_bench/resource/mapping.py
+++ b/evaluation/benchmarks/swe_bench/resource/mapping.py
@@ -28,7 +28,7 @@ def get_resource_mapping(dataset_name: str) -> dict[str, float]:
         with open(file_path, 'r') as f:
             _global_resource_mapping[dataset_name] = json.load(f)
-        logger.info(f'Loaded resource mapping for {dataset_name}')
+        logger.debug(f'Loaded resource mapping for {dataset_name}')
     return _global_resource_mapping[dataset_name]
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 23ce3a1cc9..57ab8ed20d 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -121,7 +121,7 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
     )

     if 'image_assets' in instance:
-        assets = instance['image_assets']
+        assets = json.loads(instance['image_assets'])
         assert (
             'problem_statement' in assets
         ), 'problem_statement is required in image_assets'
@@ -146,8 +146,8 @@ def get_instance_docker_image(
         # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
         docker_image_prefix = 'docker.io/swebench/'
         repo, name = instance_id.split('__')
-        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'
-        logger.info(f'Using official SWE-Bench image: {image_name}')
+        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
+        logger.debug(f'Using official SWE-Bench image: {image_name}')
         return image_name
     else:
         # OpenHands version of the image
@@ -164,10 +164,7 @@ def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
     # We use a different instance image for each instance of the swe-bench eval
-    use_swebench_official_image = bool(
-        ('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
-        and 'swe-gym' not in metadata.dataset.lower()
-    )
+    use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()
     base_container_image = get_instance_docker_image(
         instance['instance_id'],
         swebench_official_image=use_swebench_official_image,
@@ -334,15 +331,18 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

-    action = CmdRunAction(command='which python')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0 and 'testbed' in obs.content,
-        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
-    )
+    if 'multimodal' not in metadata.dataset.lower():
+        # Only non-multimodal datasets need the testbed environment activated for Python;
+        # the SWE-Bench multimodal datasets do not use the testbed environment
+        action = CmdRunAction(command='which python')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert_and_raise(
+            obs.exit_code == 0 and 'testbed' in obs.content,
+            f'Expected to find python interpreter from testbed, but got: {str(obs)}',
+        )

     logger.info('-' * 30)
     logger.info('END Runtime Initialization Fn')
@@ -761,9 +761,19 @@ if __name__ == '__main__':
         with open(cur_output_file, 'r') as f:
             for line in f:
                 instance = json.loads(line)
-                history = [event_from_dict(event) for event in instance['history']]
-                critic_result = critic.evaluate(history)
-                if not critic_result.success:
+                try:
+                    history = [
+                        event_from_dict(event) for event in instance['history']
+                    ]
+                    critic_result = critic.evaluate(
+                        history, instance['test_result'].get('git_patch', '')
+                    )
+                    if not critic_result.success:
+                        instances_failed.append(instance['instance_id'])
+                except Exception as e:
+                    logger.error(
+                        f'Error loading history for instance {instance["instance_id"]}: {e}'
+                    )
                     instances_failed.append(instance['instance_id'])
         logger.info(
             f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
diff --git a/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh b/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh
index 6bacc93fde..61ca1e1510 100755
--- a/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh
@@ -18,6 +18,7 @@ if [[ -z "$item" ]]; then
     exit 1
 fi

+
 WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')

 echo "WORKSPACE_NAME: $WORKSPACE_NAME"
@@ -36,5 +37,7 @@ mkdir -p /workspace
 cp -r /testbed /workspace/$WORKSPACE_NAME

 # Activate instance-specific environment
-. /opt/miniconda3/etc/profile.d/conda.sh
-conda activate testbed
+if [ -d /opt/miniconda3 ]; then
+    . /opt/miniconda3/etc/profile.d/conda.sh
+    conda activate testbed
+fi
diff --git a/openhands/critic/base.py b/openhands/critic/base.py
index 58a125850e..bfde3827d8 100644
--- a/openhands/critic/base.py
+++ b/openhands/critic/base.py
@@ -23,9 +23,11 @@ class CriticResult(BaseModel):

 class BaseCritic(abc.ABC):
     """
-    A critic is a function that takes in a list of events and returns a score about the quality of those events.
+    A critic is a function that takes in a list of events and an optional git patch, and returns a score reflecting the quality of those events.
     """

     @abc.abstractmethod
-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         pass
diff --git a/openhands/critic/finish_critic.py b/openhands/critic/finish_critic.py
index 8da3bc2c06..ca8dcd872b 100644
--- a/openhands/critic/finish_critic.py
+++ b/openhands/critic/finish_critic.py
@@ -5,16 +5,21 @@ from openhands.events.action import Action, AgentFinishAction


 class AgentFinishedCritic(BaseCritic):
     """This is a simple rule-based critic that checks if the last event is an AgentFinishAction.

-    If not, it will return a score of 0 and a message indicating that the agent did not finish.
+    If a git patch is provided but empty, it will return a score of 0 and a message indicating that the git patch is empty.
     """

     def __init__(self):
         pass

-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         last_action = next((h for h in reversed(events) if isinstance(h, Action)), None)
+        if git_patch is not None and len(git_patch.strip()) == 0:
+            return CriticResult(score=0, message='Git patch is empty.')
+
         if isinstance(last_action, AgentFinishAction):
             return CriticResult(score=1, message='Agent finished.')
         else:
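
Usage sketch (not part of the patch above): the change threads the evaluation's `git_patch` into the critic, so an empty patch now fails an instance even when the agent emitted an `AgentFinishAction`. A minimal illustration of the new contract, assuming `AgentFinishAction` can be constructed with its default arguments; the assertions follow directly from the `CriticResult` values in the diff.

```python
from openhands.critic.finish_critic import AgentFinishedCritic
from openhands.events.action import AgentFinishAction

critic = AgentFinishedCritic()
events = [AgentFinishAction()]  # a history whose last action is a finish

# Empty patch: the new guard short-circuits to failure despite the finish action.
result = critic.evaluate(events, git_patch='')
assert result.score == 0 and result.message == 'Git patch is empty.'

# Non-empty patch: falls through to the original last-action check.
assert critic.evaluate(events, git_patch='diff --git a/f b/f').score == 1

# No patch supplied (git_patch defaults to None): behaves exactly as before.
assert critic.evaluate(events).score == 1
```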