SWE-Gym rollout stability fix & using a validated SWE-Gym set (#7182)

Co-authored-by: Robert Brennan <accounts@rbren.io>
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
Xingyao Wang
2025-03-17 09:15:01 -04:00
committed by GitHub
parent 4f017081fc
commit a4d632498c
6 changed files with 4287 additions and 23 deletions

View File

@@ -1,4 +1,5 @@
import asyncio
import copy
import json
import os
import tempfile
@@ -149,7 +150,8 @@ def get_config(
) -> AppConfig:
# We use a different instance image for the each instance of swe-bench eval
use_official_image = bool(
'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
and 'swe-gym' not in metadata.dataset.lower()
)
base_container_image = get_instance_docker_image(
instance['instance_id'], use_official_image
@@ -475,6 +477,13 @@ def process_instance(
logger.warning(
f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
)
metadata = copy.deepcopy(metadata)
metadata.details['runtime_failure_count'] = runtime_failure_count
metadata.details['remote_runtime_resource_factor'] = (
config.sandbox.remote_runtime_resource_factor
)
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
@@ -560,20 +569,6 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
return dataset
# A list of instances that are known to be tricky to infer
# (will cause runtime failure even with resource factor = 8)
SWEGYM_EXCLUDE_IDS = [
'dask__dask-10422',
'pandas-dev__pandas-50548',
'pandas-dev__pandas-53672',
'pandas-dev__pandas-54174',
'pandas-dev__pandas-55518',
'pandas-dev__pandas-58383',
'pydata__xarray-6721',
'pytest-dev__pytest-10081',
'pytest-dev__pytest-7236',
]
if __name__ == '__main__':
parser = get_parser()
parser.add_argument(
@@ -598,11 +593,20 @@ if __name__ == '__main__':
f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
)
if 'SWE-Gym' in args.dataset:
swe_bench_tests = swe_bench_tests[
~swe_bench_tests['instance_id'].isin(SWEGYM_EXCLUDE_IDS)
]
with open(
os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'split',
'swegym_verified_instances.json',
),
'r',
) as f:
swegym_verified_instances = json.load(f)
swe_bench_tests = swe_bench_tests[
swe_bench_tests['instance_id'].isin(swegym_verified_instances)
]
logger.info(
f'{len(swe_bench_tests)} tasks left after excluding SWE-Gym excluded tasks'
f'{len(swe_bench_tests)} tasks left after filtering for SWE-Gym verified instances'
)
llm_config = None