[eval] Support SWE-Bench Multimodal (#7122)
Co-authored-by: openhands <openhands@all-hands.dev>
evaluation/benchmarks/swe_bench/README.md:

@@ -2,7 +2,9 @@
 
 This folder contains the evaluation harness that we built on top of the original [SWE-Bench benchmark](https://www.swebench.com/) ([paper](https://arxiv.org/abs/2310.06770)).
 
-**UPDATE (2/18/2025): We now support running SWE-Gym using the same evaluation harness here. For more details, checkout [this README](./SWE-Gym.md).
+**UPDATE (03/27/2025): We now support SWE-Bench multimodal evaluation! Simply use "princeton-nlp/SWE-bench_Multimodal" as the dataset name in the `run_infer.sh` script to evaluate on multimodal instances.**
+
+**UPDATE (2/18/2025): We now support running SWE-Gym using the same evaluation harness here. For more details, checkout [this README](./SWE-Gym.md).**
 
 **UPDATE (7/1/2024): We now support the official SWE-Bench dockerized evaluation as announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).**
 
@@ -62,7 +64,7 @@ in order to use `eval_limit`, you must also set `agent`.
 default, it is set to 60.
 - `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
 default, it is set to 1.
-- `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench`, `princeton-nlp/SWE-bench_Lite`, or `princeton-nlp/SWE-bench_Verified`, specifies which dataset to evaluate on.
+- `dataset`, a huggingface dataset name. e.g. `princeton-nlp/SWE-bench`, `princeton-nlp/SWE-bench_Lite`, `princeton-nlp/SWE-bench_Verified`, or `princeton-nlp/SWE-bench_Multimodal`, specifies which dataset to evaluate on.
 - `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`.
 
 > [!CAUTION]
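Not part of the diff, but for orientation: a minimal sketch of loading the newly supported dataset with the Hugging Face `datasets` library, to see the fields the harness consumes (the `image_assets` field is handled in the `run_infer.py` hunks further down).

```python
# Sketch only: load the multimodal split and peek at one instance.
from datasets import load_dataset

dataset = load_dataset('princeton-nlp/SWE-bench_Multimodal', split='test')
instance = dataset[0]
print(instance['instance_id'])     # a repo__issue identifier
print('image_assets' in instance)  # multimodal instances carry image URLs here
```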
@@ -82,6 +84,13 @@ then your command would be:
 ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ```
 
+For multimodal evaluation, you can use:
+
+```bash
+# Example for running multimodal SWE-Bench evaluation
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval_gpt4_vision HEAD CodeActAgent 10 100 1 princeton-nlp/SWE-bench_Multimodal test
+```
+
 ### Running in parallel with RemoteRuntime
 
 OpenHands Remote Runtime is currently in beta (read [here](https://runtime.all-hands.dev/) for more details), it allows you to run rollout in parallel in the cloud, so you don't need a powerful machine to run evaluation.
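Judging from the parameter list earlier in this README, the positional arguments `10 100 1` in the added example appear to map to `eval_limit`, the iteration limit, and `num_workers`, followed by the dataset name and split.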
evaluation/benchmarks/swe_bench/run_infer.py:

@@ -58,7 +58,7 @@ def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
     return f'{instance.repo}__{instance.version}'.replace('/', '__')
 
 
-def get_instruction(instance: pd.Series, metadata: EvalMetadata):
+def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
     workspace_dir_name = _get_swebench_workspace_dir_name(instance)
     instruction = f"""
 <uploaded_files>
@@ -114,12 +114,20 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
 """
 
     if RUN_WITH_BROWSING:
-        instruction += """
-<IMPORTANT!>
-You SHOULD NEVER attempt to browse the web.
-</IMPORTANT!>
-"""
-    return instruction
+        instruction += (
+            '<IMPORTANT!>\n'
+            'You SHOULD NEVER attempt to browse the web. '
+            '</IMPORTANT!>\n'
+        )
+
+    if 'image_assets' in instance:
+        assets = instance['image_assets']
+        assert (
+            'problem_statement' in assets
+        ), 'problem_statement is required in image_assets'
+        image_urls = assets['problem_statement']
+        return MessageAction(content=instruction, image_urls=image_urls)
+    return MessageAction(content=instruction)
 
 
 # TODO: migrate all swe-bench docker to ghcr.io/openhands
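To see the new return contract of `get_instruction` in isolation, here is a self-contained sketch; `MessageAction` below is a stand-in dataclass, not the OpenHands class, and the instruction text is a placeholder.

```python
from dataclasses import dataclass


@dataclass
class MessageAction:  # stand-in for the real OpenHands MessageAction
    content: str
    image_urls: list[str] | None = None


def get_instruction_sketch(instance: dict) -> MessageAction:
    instruction = f"Fix the issue described in {instance['instance_id']}."
    if 'image_assets' in instance:
        assets = instance['image_assets']
        assert 'problem_statement' in assets, 'problem_statement is required'
        # Attach the problem-statement screenshots to the first user message.
        return MessageAction(
            content=instruction, image_urls=assets['problem_statement']
        )
    return MessageAction(content=instruction)


action = get_instruction_sketch(
    {
        'instance_id': 'demo__demo-1',
        'image_assets': {'problem_statement': ['https://example.com/fig1.png']},
    }
)
assert action.image_urls == ['https://example.com/fig1.png']
```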
@@ -129,14 +137,18 @@ DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
 logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')
 
 
-def get_instance_docker_image(instance_id: str, official_image: bool = False) -> str:
-    if official_image:
+def get_instance_docker_image(
+    instance_id: str,
+    swebench_official_image: bool = False,
+) -> str:
+    if swebench_official_image:
         # Official SWE-Bench image
         # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
         docker_image_prefix = 'docker.io/swebench/'
         repo, name = instance_id.split('__')
-        image_name = f'sweb.eval.x86_64.{repo}_1776_{name}:latest'
-        logger.warning(f'Using official SWE-Bench image: {image_name}')
+        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'
+        logger.info(f'Using official SWE-Bench image: {image_name}')
+        return image_name
     else:
         # OpenHands version of the image
         docker_image_prefix = DEFAULT_DOCKER_IMAGE_PREFIX
@@ -144,7 +156,7 @@ def get_instance_docker_image(instance_id: str, official_image: bool = False) ->
         image_name = image_name.replace(
             '__', '_s_'
         )  # to comply with docker image naming convention
-    return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()
+        return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()
 
 
 def get_config(
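A standalone restatement of the image-naming logic from the two hunks above, for a sample instance id. The prefix value and the `'sweb.eval.x86_64.' + instance_id` line in the else branch fall outside the hunks shown, so they are assumptions here.

```python
DEFAULT_DOCKER_IMAGE_PREFIX = 'docker.io/example-org/'  # illustrative value only


def get_instance_docker_image_sketch(
    instance_id: str,
    swebench_official_image: bool = False,
) -> str:
    if swebench_official_image:
        # Official image, e.g. swebench/sweb.eval.x86_64.django_1776_django-11333
        repo, name = instance_id.split('__')
        return f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'
    else:
        image_name = 'sweb.eval.x86_64.' + instance_id  # assumed elided line
        image_name = image_name.replace(
            '__', '_s_'
        )  # to comply with docker image naming convention
        return (DEFAULT_DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name).lower()


print(get_instance_docker_image_sketch('django__django-11333', True))
# swebench/sweb.eval.x86_64.django_1776_django-11333:latest
print(get_instance_docker_image_sketch('django__django-11333'))
# docker.io/example-org/sweb.eval.x86_64.django_s_django-11333
```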
@@ -152,12 +164,13 @@ def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
     # We use a different instance image for the each instance of swe-bench eval
-    use_official_image = bool(
+    use_swebench_official_image = bool(
         ('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
         and 'swe-gym' not in metadata.dataset.lower()
     )
     base_container_image = get_instance_docker_image(
-        instance['instance_id'], use_official_image
+        instance['instance_id'],
+        swebench_official_image=use_swebench_official_image,
     )
     logger.info(
         f'Using instance container image: {base_container_image}. '
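The renamed boolean encodes a dataset-name heuristic: official SWE-Bench images cover the Verified and Lite subsets, while SWE-Gym datasets (whose names can also contain "lite") and the new Multimodal dataset fall through to the OpenHands-built images. A quick check of that expression:

```python
def use_swebench_official_image(dataset: str) -> bool:
    # Same boolean as in the hunk above, applied to a dataset name string.
    return bool(
        ('verified' in dataset.lower() or 'lite' in dataset.lower())
        and 'swe-gym' not in dataset.lower()
    )


assert use_swebench_official_image('princeton-nlp/SWE-bench_Verified')
assert use_swebench_official_image('princeton-nlp/SWE-bench_Lite')
assert not use_swebench_official_image('princeton-nlp/SWE-bench_Multimodal')
assert not use_swebench_official_image('SWE-Gym/SWE-Gym-Lite')  # hypothetical name
```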
@@ -493,13 +506,13 @@ def process_instance(
     try:
         initialize_runtime(runtime, instance)
 
-        instruction = get_instruction(instance, metadata)
+        message_action = get_instruction(instance, metadata)
 
         # Here's how you can run the agent (similar to the `main` function) and get the final task state
         state: State | None = asyncio.run(
             run_controller(
                 config=config,
-                initial_user_action=MessageAction(content=instruction),
+                initial_user_action=message_action,
                 runtime=runtime,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                     metadata.agent_class
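Passing the whole `MessageAction` through as `initial_user_action`, rather than wrapping a bare string at the call site, is what lets the image URLs attached in `get_instruction` reach the agent's first user turn without any change to `run_controller` itself.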
@@ -539,6 +552,11 @@ def process_instance(
     metrics = get_metrics(state)
 
     # Save the output
+    instruction = message_action.content
+    if message_action.image_urls:
+        instruction += (
+            '\n\n<image_urls>' + '\n'.join(message_action.image_urls) + '</image_urls>'
+        )
     output = EvalOutput(
         instance_id=instance.instance_id,
         instruction=instruction,
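For the record format, a sketch of what the saved `instruction` string looks like once image URLs are appended, plus one way a consumer could recover them (the regex is ours, not part of the commit):

```python
import re

content = 'Fix the broken navbar rendering.'
image_urls = ['https://example.com/before.png', 'https://example.com/after.png']

# Mirrors the appending logic in the hunk above.
instruction = content
if image_urls:
    instruction += '\n\n<image_urls>' + '\n'.join(image_urls) + '</image_urls>'

# Recovering the URLs from a saved EvalOutput record.
match = re.search(r'<image_urls>(.*?)</image_urls>', instruction, re.DOTALL)
recovered = match.group(1).split('\n') if match else []
assert recovered == image_urls
```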