[eval] Upgrade SWE-Bench to use official image and latest harness (#6838)

Co-authored-by: Robert Brennan <accounts@rbren.io>
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
Xingyao Wang
2025-02-27 08:15:05 -05:00
committed by GitHub
parent 0137600988
commit 33780f97d0
13 changed files with 3267 additions and 2778 deletions

View File

@@ -11,7 +11,11 @@ from swebench.harness.run_evaluation import (
APPLY_PATCH_FAIL, APPLY_PATCH_FAIL,
APPLY_PATCH_PASS, APPLY_PATCH_PASS,
) )
from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec from swebench.harness.test_spec.test_spec import (
SWEbenchInstance,
TestSpec,
make_test_spec,
)
from swebench.harness.utils import load_swebench_dataset from swebench.harness.utils import load_swebench_dataset
from tqdm import tqdm from tqdm import tqdm

View File

@@ -58,8 +58,6 @@ def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
def get_instruction(instance: pd.Series, metadata: EvalMetadata): def get_instruction(instance: pd.Series, metadata: EvalMetadata):
workspace_dir_name = _get_swebench_workspace_dir_name(instance) workspace_dir_name = _get_swebench_workspace_dir_name(instance)
# Prepare instruction
# Instruction based on Anthropic's official trajectory # Instruction based on Anthropic's official trajectory
# https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
instruction = ( instruction = (
@@ -71,14 +69,20 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
f'{instance.problem_statement}\n' f'{instance.problem_statement}\n'
'</issue_description>\n\n' '</issue_description>\n\n'
'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n' 'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
"I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n" "I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
'Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.\n' "Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
'Follow these steps to resolve the issue:\n' 'Follow these steps to resolve the issue:\n'
'1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n' '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
'2. Create a script to reproduce the error and execute it with `python <filename.py>` using the BashTool, to confirm the error\n' '2. Create a script to reproduce the error and execute it with `python <filename.py>` using the BashTool, to confirm the error\n'
'3. Edit the sourcecode of the repo to resolve the issue\n' '3. Edit the sourcecode of the repo to resolve the issue\n'
'4. Rerun your reproduce script and confirm that the error is fixed!\n' '4. Rerun your reproduce script and confirm that the error is fixed!\n'
'5. Think about edgecases and make sure your fix handles them as well\n' '5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well\n'
f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
' - The issue you are fixing\n'
' - The files you modified\n'
' - The functions you changed\n'
' Make sure all these tests pass with your changes.\n'
"Your thinking should be thorough and so it's fine if it's very long.\n" "Your thinking should be thorough and so it's fine if it's very long.\n"
) )
@@ -96,11 +100,19 @@ DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xing
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}') logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
def get_instance_docker_image(instance_id: str, official_image: bool = False) -> str:
    """Build the docker image reference for a single SWE-Bench instance.

    Args:
        instance_id: SWE-Bench instance id, e.g. ``django__django-11333``.
        official_image: When True, use the official SWE-Bench naming scheme
            (``__`` becomes ``_1776_`` and an explicit ``:latest`` tag is
            appended). When False, use the OpenHands naming scheme (``__``
            becomes ``_s_`` and no tag is appended).

    Returns:
        ``DOCKER_IMAGE_PREFIX`` (trailing slashes stripped) joined to the image
        name with a single ``/``; the whole reference is lower-cased.
    """
    if official_image:
        # Official SWE-Bench image, e.g.
        # swebench/sweb.eval.x86_64.django_1776_django-11333:latest
        # Replace every '__' instead of unpacking a two-way split, so an id
        # with zero or several '__' separators no longer raises ValueError and
        # both branches of this function treat the separator the same way.
        image_name = (
            'sweb.eval.x86_64.' + instance_id.replace('__', '_1776_') + ':latest'
        )
        logger.warning(f'Using official SWE-Bench image: {image_name}')
    else:
        # OpenHands version of the image
        image_name = 'sweb.eval.x86_64.' + instance_id
        image_name = image_name.replace(
            '__', '_s_'
        )  # to comply with docker image naming convention
    return (DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name).lower()
@@ -111,7 +123,12 @@ def get_config(
SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1' SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
if USE_INSTANCE_IMAGE: if USE_INSTANCE_IMAGE:
# We use a different instance image for each instance of swe-bench eval # We use a different instance image for each instance of swe-bench eval
base_container_image = get_instance_docker_image(instance['instance_id']) use_official_image = bool(
'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
)
base_container_image = get_instance_docker_image(
instance['instance_id'], use_official_image
)
logger.info( logger.info(
f'Using instance container image: {base_container_image}. ' f'Using instance container image: {base_container_image}. '
f'Please make sure this image exists. ' f'Please make sure this image exists. '

View File

@@ -1,336 +1,300 @@
sweb.base.x86_64:latest swebench/sweb.eval.x86_64.astropy_1776_astropy-12907:latest
sweb.env.x86_64.088a7e628bda9770f9757b:latest swebench/sweb.eval.x86_64.astropy_1776_astropy-14182:latest
sweb.env.x86_64.0d80c7dec81ee2f2f513e2:latest swebench/sweb.eval.x86_64.astropy_1776_astropy-14365:latest
sweb.env.x86_64.0f99bce2750f3109957bec:latest swebench/sweb.eval.x86_64.astropy_1776_astropy-14995:latest
sweb.env.x86_64.1b3b218535da0abf4469cb:latest swebench/sweb.eval.x86_64.astropy_1776_astropy-6938:latest
sweb.env.x86_64.1c1a6945f732f9391228c5:latest swebench/sweb.eval.x86_64.astropy_1776_astropy-7746:latest
sweb.env.x86_64.1f92e6d7cef88badc4f744:latest swebench/sweb.eval.x86_64.django_1776_django-10914:latest
sweb.env.x86_64.27dd9791e13f5c857a09f9:latest swebench/sweb.eval.x86_64.django_1776_django-10924:latest
sweb.env.x86_64.297af196949a2a635bce66:latest swebench/sweb.eval.x86_64.django_1776_django-11001:latest
sweb.env.x86_64.2baaea72acc974f6c02079:latest swebench/sweb.eval.x86_64.django_1776_django-11019:latest
sweb.env.x86_64.2e50125951bc69cddd7421:latest swebench/sweb.eval.x86_64.django_1776_django-11039:latest
sweb.env.x86_64.2f217c8b4490bfa0e2ba14:latest swebench/sweb.eval.x86_64.django_1776_django-11049:latest
sweb.env.x86_64.31244378a92e3bcce809ac:latest swebench/sweb.eval.x86_64.django_1776_django-11099:latest
sweb.env.x86_64.428468730904ff6b4232aa:latest swebench/sweb.eval.x86_64.django_1776_django-11133:latest
sweb.env.x86_64.5d1fda9d55d65d8a4e5bdb:latest swebench/sweb.eval.x86_64.django_1776_django-11179:latest
sweb.env.x86_64.6b007979cf533f0f3016e8:latest swebench/sweb.eval.x86_64.django_1776_django-11283:latest
sweb.env.x86_64.7037e8c448a4b8ebfe9b13:latest swebench/sweb.eval.x86_64.django_1776_django-11422:latest
sweb.env.x86_64.71498c7426dbf05599642f:latest swebench/sweb.eval.x86_64.django_1776_django-11564:latest
sweb.env.x86_64.756beac07713d7e8dc1129:latest swebench/sweb.eval.x86_64.django_1776_django-11583:latest
sweb.env.x86_64.78278ae2cf880e395f1337:latest swebench/sweb.eval.x86_64.django_1776_django-11620:latest
sweb.env.x86_64.8f1f7b974f0c57c7aeba39:latest swebench/sweb.eval.x86_64.django_1776_django-11630:latest
sweb.env.x86_64.934a137824256b612e9dc5:latest swebench/sweb.eval.x86_64.django_1776_django-11742:latest
sweb.env.x86_64.a0efca7a0fe6719dbf65c2:latest swebench/sweb.eval.x86_64.django_1776_django-11797:latest
sweb.env.x86_64.a18371b03f944585b4f08c:latest swebench/sweb.eval.x86_64.django_1776_django-11815:latest
sweb.env.x86_64.a33dddf55cdff5d8e23374:latest swebench/sweb.eval.x86_64.django_1776_django-11848:latest
sweb.env.x86_64.aa92880033da20ca313928:latest swebench/sweb.eval.x86_64.django_1776_django-11905:latest
sweb.env.x86_64.b649f0ff62fad147f7f073:latest swebench/sweb.eval.x86_64.django_1776_django-11910:latest
sweb.env.x86_64.b7ce4be3b3c35f68c61248:latest swebench/sweb.eval.x86_64.django_1776_django-11964:latest
sweb.env.x86_64.c70909fdac4897d1c685df:latest swebench/sweb.eval.x86_64.django_1776_django-11999:latest
sweb.env.x86_64.c795f4b88616b8462021ed:latest swebench/sweb.eval.x86_64.django_1776_django-12113:latest
sweb.env.x86_64.cc47cc71483942d0c3a15e:latest swebench/sweb.eval.x86_64.django_1776_django-12125:latest
sweb.env.x86_64.dc5ff4c0e3fe8db5afc4da:latest swebench/sweb.eval.x86_64.django_1776_django-12184:latest
sweb.env.x86_64.e3afd7f04b325a4de4982d:latest swebench/sweb.eval.x86_64.django_1776_django-12284:latest
sweb.env.x86_64.e5bb89bf78258a7d14c34b:latest swebench/sweb.eval.x86_64.django_1776_django-12286:latest
sweb.env.x86_64.e83e37f52c09532c62acfb:latest swebench/sweb.eval.x86_64.django_1776_django-12308:latest
sweb.env.x86_64.efa6065ed5bf204410fd53:latest swebench/sweb.eval.x86_64.django_1776_django-12453:latest
sweb.eval.x86_64.astropy_s_astropy-12907:latest swebench/sweb.eval.x86_64.django_1776_django-12470:latest
sweb.eval.x86_64.astropy_s_astropy-14182:latest swebench/sweb.eval.x86_64.django_1776_django-12497:latest
sweb.eval.x86_64.astropy_s_astropy-14365:latest swebench/sweb.eval.x86_64.django_1776_django-12589:latest
sweb.eval.x86_64.astropy_s_astropy-14995:latest swebench/sweb.eval.x86_64.django_1776_django-12700:latest
sweb.eval.x86_64.astropy_s_astropy-6938:latest swebench/sweb.eval.x86_64.django_1776_django-12708:latest
sweb.eval.x86_64.astropy_s_astropy-7746:latest swebench/sweb.eval.x86_64.django_1776_django-12747:latest
sweb.eval.x86_64.django_s_django-10914:latest swebench/sweb.eval.x86_64.django_1776_django-12856:latest
sweb.eval.x86_64.django_s_django-10924:latest swebench/sweb.eval.x86_64.django_1776_django-12908:latest
sweb.eval.x86_64.django_s_django-11001:latest swebench/sweb.eval.x86_64.django_1776_django-12915:latest
sweb.eval.x86_64.django_s_django-11019:latest swebench/sweb.eval.x86_64.django_1776_django-12983:latest
sweb.eval.x86_64.django_s_django-11039:latest swebench/sweb.eval.x86_64.django_1776_django-13028:latest
sweb.eval.x86_64.django_s_django-11049:latest swebench/sweb.eval.x86_64.django_1776_django-13033:latest
sweb.eval.x86_64.django_s_django-11099:latest swebench/sweb.eval.x86_64.django_1776_django-13158:latest
sweb.eval.x86_64.django_s_django-11133:latest swebench/sweb.eval.x86_64.django_1776_django-13220:latest
sweb.eval.x86_64.django_s_django-11179:latest swebench/sweb.eval.x86_64.django_1776_django-13230:latest
sweb.eval.x86_64.django_s_django-11283:latest swebench/sweb.eval.x86_64.django_1776_django-13265:latest
sweb.eval.x86_64.django_s_django-11422:latest swebench/sweb.eval.x86_64.django_1776_django-13315:latest
sweb.eval.x86_64.django_s_django-11564:latest swebench/sweb.eval.x86_64.django_1776_django-13321:latest
sweb.eval.x86_64.django_s_django-11583:latest swebench/sweb.eval.x86_64.django_1776_django-13401:latest
sweb.eval.x86_64.django_s_django-11620:latest swebench/sweb.eval.x86_64.django_1776_django-13447:latest
sweb.eval.x86_64.django_s_django-11630:latest swebench/sweb.eval.x86_64.django_1776_django-13448:latest
sweb.eval.x86_64.django_s_django-11742:latest swebench/sweb.eval.x86_64.django_1776_django-13551:latest
sweb.eval.x86_64.django_s_django-11797:latest swebench/sweb.eval.x86_64.django_1776_django-13590:latest
sweb.eval.x86_64.django_s_django-11815:latest swebench/sweb.eval.x86_64.django_1776_django-13658:latest
sweb.eval.x86_64.django_s_django-11848:latest swebench/sweb.eval.x86_64.django_1776_django-13660:latest
sweb.eval.x86_64.django_s_django-11905:latest swebench/sweb.eval.x86_64.django_1776_django-13710:latest
sweb.eval.x86_64.django_s_django-11910:latest swebench/sweb.eval.x86_64.django_1776_django-13757:latest
sweb.eval.x86_64.django_s_django-11964:latest swebench/sweb.eval.x86_64.django_1776_django-13768:latest
sweb.eval.x86_64.django_s_django-11999:latest swebench/sweb.eval.x86_64.django_1776_django-13925:latest
sweb.eval.x86_64.django_s_django-12113:latest swebench/sweb.eval.x86_64.django_1776_django-13933:latest
sweb.eval.x86_64.django_s_django-12125:latest swebench/sweb.eval.x86_64.django_1776_django-13964:latest
sweb.eval.x86_64.django_s_django-12184:latest swebench/sweb.eval.x86_64.django_1776_django-14016:latest
sweb.eval.x86_64.django_s_django-12284:latest swebench/sweb.eval.x86_64.django_1776_django-14017:latest
sweb.eval.x86_64.django_s_django-12286:latest swebench/sweb.eval.x86_64.django_1776_django-14155:latest
sweb.eval.x86_64.django_s_django-12308:latest swebench/sweb.eval.x86_64.django_1776_django-14238:latest
sweb.eval.x86_64.django_s_django-12453:latest swebench/sweb.eval.x86_64.django_1776_django-14382:latest
sweb.eval.x86_64.django_s_django-12470:latest swebench/sweb.eval.x86_64.django_1776_django-14411:latest
sweb.eval.x86_64.django_s_django-12497:latest swebench/sweb.eval.x86_64.django_1776_django-14534:latest
sweb.eval.x86_64.django_s_django-12589:latest swebench/sweb.eval.x86_64.django_1776_django-14580:latest
sweb.eval.x86_64.django_s_django-12700:latest swebench/sweb.eval.x86_64.django_1776_django-14608:latest
sweb.eval.x86_64.django_s_django-12708:latest swebench/sweb.eval.x86_64.django_1776_django-14667:latest
sweb.eval.x86_64.django_s_django-12747:latest swebench/sweb.eval.x86_64.django_1776_django-14672:latest
sweb.eval.x86_64.django_s_django-12856:latest swebench/sweb.eval.x86_64.django_1776_django-14730:latest
sweb.eval.x86_64.django_s_django-12908:latest swebench/sweb.eval.x86_64.django_1776_django-14752:latest
sweb.eval.x86_64.django_s_django-12915:latest swebench/sweb.eval.x86_64.django_1776_django-14787:latest
sweb.eval.x86_64.django_s_django-12983:latest swebench/sweb.eval.x86_64.django_1776_django-14855:latest
sweb.eval.x86_64.django_s_django-13028:latest swebench/sweb.eval.x86_64.django_1776_django-14915:latest
sweb.eval.x86_64.django_s_django-13033:latest swebench/sweb.eval.x86_64.django_1776_django-14997:latest
sweb.eval.x86_64.django_s_django-13158:latest swebench/sweb.eval.x86_64.django_1776_django-14999:latest
sweb.eval.x86_64.django_s_django-13220:latest swebench/sweb.eval.x86_64.django_1776_django-15061:latest
sweb.eval.x86_64.django_s_django-13230:latest swebench/sweb.eval.x86_64.django_1776_django-15202:latest
sweb.eval.x86_64.django_s_django-13265:latest swebench/sweb.eval.x86_64.django_1776_django-15213:latest
sweb.eval.x86_64.django_s_django-13315:latest swebench/sweb.eval.x86_64.django_1776_django-15252:latest
sweb.eval.x86_64.django_s_django-13321:latest swebench/sweb.eval.x86_64.django_1776_django-15320:latest
sweb.eval.x86_64.django_s_django-13401:latest swebench/sweb.eval.x86_64.django_1776_django-15347:latest
sweb.eval.x86_64.django_s_django-13447:latest swebench/sweb.eval.x86_64.django_1776_django-15388:latest
sweb.eval.x86_64.django_s_django-13448:latest swebench/sweb.eval.x86_64.django_1776_django-15400:latest
sweb.eval.x86_64.django_s_django-13551:latest swebench/sweb.eval.x86_64.django_1776_django-15498:latest
sweb.eval.x86_64.django_s_django-13590:latest swebench/sweb.eval.x86_64.django_1776_django-15695:latest
sweb.eval.x86_64.django_s_django-13658:latest swebench/sweb.eval.x86_64.django_1776_django-15738:latest
sweb.eval.x86_64.django_s_django-13660:latest swebench/sweb.eval.x86_64.django_1776_django-15781:latest
sweb.eval.x86_64.django_s_django-13710:latest swebench/sweb.eval.x86_64.django_1776_django-15789:latest
sweb.eval.x86_64.django_s_django-13757:latest swebench/sweb.eval.x86_64.django_1776_django-15790:latest
sweb.eval.x86_64.django_s_django-13768:latest swebench/sweb.eval.x86_64.django_1776_django-15814:latest
sweb.eval.x86_64.django_s_django-13925:latest swebench/sweb.eval.x86_64.django_1776_django-15819:latest
sweb.eval.x86_64.django_s_django-13933:latest swebench/sweb.eval.x86_64.django_1776_django-15851:latest
sweb.eval.x86_64.django_s_django-13964:latest swebench/sweb.eval.x86_64.django_1776_django-15902:latest
sweb.eval.x86_64.django_s_django-14016:latest swebench/sweb.eval.x86_64.django_1776_django-15996:latest
sweb.eval.x86_64.django_s_django-14017:latest swebench/sweb.eval.x86_64.django_1776_django-16041:latest
sweb.eval.x86_64.django_s_django-14155:latest swebench/sweb.eval.x86_64.django_1776_django-16046:latest
sweb.eval.x86_64.django_s_django-14238:latest swebench/sweb.eval.x86_64.django_1776_django-16139:latest
sweb.eval.x86_64.django_s_django-14382:latest swebench/sweb.eval.x86_64.django_1776_django-16229:latest
sweb.eval.x86_64.django_s_django-14411:latest swebench/sweb.eval.x86_64.django_1776_django-16255:latest
sweb.eval.x86_64.django_s_django-14534:latest swebench/sweb.eval.x86_64.django_1776_django-16379:latest
sweb.eval.x86_64.django_s_django-14580:latest swebench/sweb.eval.x86_64.django_1776_django-16400:latest
sweb.eval.x86_64.django_s_django-14608:latest swebench/sweb.eval.x86_64.django_1776_django-16408:latest
sweb.eval.x86_64.django_s_django-14667:latest swebench/sweb.eval.x86_64.django_1776_django-16527:latest
sweb.eval.x86_64.django_s_django-14672:latest swebench/sweb.eval.x86_64.django_1776_django-16595:latest
sweb.eval.x86_64.django_s_django-14730:latest swebench/sweb.eval.x86_64.django_1776_django-16816:latest
sweb.eval.x86_64.django_s_django-14752:latest swebench/sweb.eval.x86_64.django_1776_django-16820:latest
sweb.eval.x86_64.django_s_django-14787:latest swebench/sweb.eval.x86_64.django_1776_django-16873:latest
sweb.eval.x86_64.django_s_django-14855:latest swebench/sweb.eval.x86_64.django_1776_django-16910:latest
sweb.eval.x86_64.django_s_django-14915:latest swebench/sweb.eval.x86_64.django_1776_django-17051:latest
sweb.eval.x86_64.django_s_django-14997:latest swebench/sweb.eval.x86_64.django_1776_django-17087:latest
sweb.eval.x86_64.django_s_django-14999:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-18869:latest
sweb.eval.x86_64.django_s_django-15061:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22711:latest
sweb.eval.x86_64.django_s_django-15202:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22835:latest
sweb.eval.x86_64.django_s_django-15213:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23299:latest
sweb.eval.x86_64.django_s_django-15252:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23314:latest
sweb.eval.x86_64.django_s_django-15320:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23476:latest
sweb.eval.x86_64.django_s_django-15347:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23562:latest
sweb.eval.x86_64.django_s_django-15388:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23563:latest
sweb.eval.x86_64.django_s_django-15400:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23913:latest
sweb.eval.x86_64.django_s_django-15498:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23964:latest
sweb.eval.x86_64.django_s_django-15695:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23987:latest
sweb.eval.x86_64.django_s_django-15738:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24149:latest
sweb.eval.x86_64.django_s_django-15781:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24265:latest
sweb.eval.x86_64.django_s_django-15789:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24334:latest
sweb.eval.x86_64.django_s_django-15790:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24970:latest
sweb.eval.x86_64.django_s_django-15814:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25079:latest
sweb.eval.x86_64.django_s_django-15819:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25311:latest
sweb.eval.x86_64.django_s_django-15851:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25332:latest
sweb.eval.x86_64.django_s_django-15902:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25433:latest
sweb.eval.x86_64.django_s_django-15996:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25442:latest
sweb.eval.x86_64.django_s_django-16041:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25498:latest
sweb.eval.x86_64.django_s_django-16046:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26011:latest
sweb.eval.x86_64.django_s_django-16139:latest swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26020:latest
sweb.eval.x86_64.django_s_django-16229:latest swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-2848:latest
sweb.eval.x86_64.django_s_django-16255:latest swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3010:latest
sweb.eval.x86_64.django_s_django-16379:latest swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3190:latest
sweb.eval.x86_64.django_s_django-16400:latest swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3407:latest
sweb.eval.x86_64.django_s_django-16408:latest swebench/sweb.eval.x86_64.pallets_1776_flask-4045:latest
sweb.eval.x86_64.django_s_django-16527:latest swebench/sweb.eval.x86_64.pallets_1776_flask-4992:latest
sweb.eval.x86_64.django_s_django-16595:latest swebench/sweb.eval.x86_64.pallets_1776_flask-5063:latest
sweb.eval.x86_64.django_s_django-16816:latest swebench/sweb.eval.x86_64.psf_1776_requests-1963:latest
sweb.eval.x86_64.django_s_django-16820:latest swebench/sweb.eval.x86_64.psf_1776_requests-2148:latest
sweb.eval.x86_64.django_s_django-16873:latest swebench/sweb.eval.x86_64.psf_1776_requests-2317:latest
sweb.eval.x86_64.django_s_django-16910:latest swebench/sweb.eval.x86_64.psf_1776_requests-2674:latest
sweb.eval.x86_64.django_s_django-17051:latest swebench/sweb.eval.x86_64.psf_1776_requests-3362:latest
sweb.eval.x86_64.django_s_django-17087:latest swebench/sweb.eval.x86_64.psf_1776_requests-863:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-18869:latest swebench/sweb.eval.x86_64.pydata_1776_xarray-3364:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-22711:latest swebench/sweb.eval.x86_64.pydata_1776_xarray-4094:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-22835:latest swebench/sweb.eval.x86_64.pydata_1776_xarray-4248:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23299:latest swebench/sweb.eval.x86_64.pydata_1776_xarray-4493:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23314:latest swebench/sweb.eval.x86_64.pydata_1776_xarray-5131:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23476:latest swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-5859:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23562:latest swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-6506:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23563:latest swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7080:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23913:latest swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7114:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23964:latest swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7228:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23987:latest swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7993:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-24149:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-11143:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-24265:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-11148:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-24334:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5103:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-24970:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5221:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25079:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5227:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25311:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5413:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25332:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5495:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25433:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5692:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25442:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-6116:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25498:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7168:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-26011:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7220:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-26020:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7373:latest
sweb.eval.x86_64.mwaskom_s_seaborn-2848:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7432:latest
sweb.eval.x86_64.mwaskom_s_seaborn-3010:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7490:latest
sweb.eval.x86_64.mwaskom_s_seaborn-3190:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-8365:latest
sweb.eval.x86_64.mwaskom_s_seaborn-3407:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-8906:latest
sweb.eval.x86_64.pallets_s_flask-4045:latest swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-9359:latest
sweb.eval.x86_64.pallets_s_flask-4992:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10297:latest
sweb.eval.x86_64.pallets_s_flask-5063:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10508:latest
sweb.eval.x86_64.psf_s_requests-1963:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10949:latest
sweb.eval.x86_64.psf_s_requests-2148:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-11040:latest
sweb.eval.x86_64.psf_s_requests-2317:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-11281:latest
sweb.eval.x86_64.psf_s_requests-2674:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-12471:latest
sweb.eval.x86_64.psf_s_requests-3362:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13142:latest
sweb.eval.x86_64.psf_s_requests-863:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13241:latest
sweb.eval.x86_64.pydata_s_xarray-3364:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13439:latest
sweb.eval.x86_64.pydata_s_xarray-4094:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13496:latest
sweb.eval.x86_64.pydata_s_xarray-4248:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13497:latest
sweb.eval.x86_64.pydata_s_xarray-4493:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13584:latest
sweb.eval.x86_64.pydata_s_xarray-5131:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13779:latest
sweb.eval.x86_64.pylint-dev_s_pylint-5859:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14087:latest
sweb.eval.x86_64.pylint-dev_s_pylint-6506:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14092:latest
sweb.eval.x86_64.pylint-dev_s_pylint-7080:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14894:latest
sweb.eval.x86_64.pylint-dev_s_pylint-7114:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14983:latest
sweb.eval.x86_64.pylint-dev_s_pylint-7228:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-15512:latest
sweb.eval.x86_64.pylint-dev_s_pylint-7993:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-15535:latest
sweb.eval.x86_64.pytest-dev_s_pytest-11143:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25500:latest
sweb.eval.x86_64.pytest-dev_s_pytest-11148:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25570:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5103:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25638:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5221:latest swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25747:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5227:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10325:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5413:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10451:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5495:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-11445:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5692:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7686:latest
sweb.eval.x86_64.pytest-dev_s_pytest-6116:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7738:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7168:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7975:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7220:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8273:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7373:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8282:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7432:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8435:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7490:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8474:latest
sweb.eval.x86_64.pytest-dev_s_pytest-8365:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8506:latest
sweb.eval.x86_64.pytest-dev_s_pytest-8906:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8595:latest
sweb.eval.x86_64.pytest-dev_s_pytest-9359:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8627:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10297:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8713:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10508:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8721:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10949:latest swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8801:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-11040:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-11400:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-11281:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-11870:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-12471:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-11897:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13142:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-12171:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13241:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-12236:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13439:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-12419:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13496:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-12454:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13497:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-12481:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13584:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13031:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13779:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13043:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14087:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13146:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14092:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13177:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14894:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13437:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14983:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13471:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-15512:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13480:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-15535:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13647:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25500:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13773:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25570:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13895:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25638:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13915:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25747:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-13971:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-10325:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-14024:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-10451:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-14308:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-11445:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-14317:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-7686:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-14396:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-7738:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-14774:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-7975:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-14817:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8273:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-15011:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8282:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-15308:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8435:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-15345:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8474:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-15346:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8506:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-15609:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8595:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-15678:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8627:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-16106:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8713:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-16281:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8721:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-16503:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8801:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-16792:latest
sweb.eval.x86_64.sympy_s_sympy-11400:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-16988:latest
sweb.eval.x86_64.sympy_s_sympy-11870:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-17022:latest
sweb.eval.x86_64.sympy_s_sympy-11897:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-17139:latest
sweb.eval.x86_64.sympy_s_sympy-12171:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-17630:latest
sweb.eval.x86_64.sympy_s_sympy-12236:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-17655:latest
sweb.eval.x86_64.sympy_s_sympy-12419:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-18057:latest
sweb.eval.x86_64.sympy_s_sympy-12454:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-18087:latest
sweb.eval.x86_64.sympy_s_sympy-12481:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest
sweb.eval.x86_64.sympy_s_sympy-13031:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-18199:latest
sweb.eval.x86_64.sympy_s_sympy-13043:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-18532:latest
sweb.eval.x86_64.sympy_s_sympy-13146:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-18621:latest
sweb.eval.x86_64.sympy_s_sympy-13177:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-18698:latest
sweb.eval.x86_64.sympy_s_sympy-13437:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-18835:latest
sweb.eval.x86_64.sympy_s_sympy-13471:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-19007:latest
sweb.eval.x86_64.sympy_s_sympy-13480:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-19254:latest
sweb.eval.x86_64.sympy_s_sympy-13647:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-19487:latest
sweb.eval.x86_64.sympy_s_sympy-13773:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-20049:latest
sweb.eval.x86_64.sympy_s_sympy-13895:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-20154:latest
sweb.eval.x86_64.sympy_s_sympy-13915:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-20212:latest
sweb.eval.x86_64.sympy_s_sympy-13971:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-20322:latest
sweb.eval.x86_64.sympy_s_sympy-14024:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-20442:latest
sweb.eval.x86_64.sympy_s_sympy-14308:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-20590:latest
sweb.eval.x86_64.sympy_s_sympy-14317:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-20639:latest
sweb.eval.x86_64.sympy_s_sympy-14396:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-21055:latest
sweb.eval.x86_64.sympy_s_sympy-14774:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-21171:latest
sweb.eval.x86_64.sympy_s_sympy-14817:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-21379:latest
sweb.eval.x86_64.sympy_s_sympy-15011:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-21612:latest
sweb.eval.x86_64.sympy_s_sympy-15308:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-21614:latest
sweb.eval.x86_64.sympy_s_sympy-15345:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-21627:latest
sweb.eval.x86_64.sympy_s_sympy-15346:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-21847:latest
sweb.eval.x86_64.sympy_s_sympy-15609:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-22005:latest
sweb.eval.x86_64.sympy_s_sympy-15678:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-22714:latest
sweb.eval.x86_64.sympy_s_sympy-16106:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-22840:latest
sweb.eval.x86_64.sympy_s_sympy-16281:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-23117:latest
sweb.eval.x86_64.sympy_s_sympy-16503:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-23191:latest
sweb.eval.x86_64.sympy_s_sympy-16792:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-23262:latest
sweb.eval.x86_64.sympy_s_sympy-16988:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-24066:latest
sweb.eval.x86_64.sympy_s_sympy-17022:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-24102:latest
sweb.eval.x86_64.sympy_s_sympy-17139:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-24152:latest
sweb.eval.x86_64.sympy_s_sympy-17630:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-24213:latest
sweb.eval.x86_64.sympy_s_sympy-17655:latest swebench/sweb.eval.x86_64.sympy_1776_sympy-24909:latest
sweb.eval.x86_64.sympy_s_sympy-18057:latest
sweb.eval.x86_64.sympy_s_sympy-18087:latest
sweb.eval.x86_64.sympy_s_sympy-18189:latest
sweb.eval.x86_64.sympy_s_sympy-18199:latest
sweb.eval.x86_64.sympy_s_sympy-18532:latest
sweb.eval.x86_64.sympy_s_sympy-18621:latest
sweb.eval.x86_64.sympy_s_sympy-18698:latest
sweb.eval.x86_64.sympy_s_sympy-18835:latest
sweb.eval.x86_64.sympy_s_sympy-19007:latest
sweb.eval.x86_64.sympy_s_sympy-19254:latest
sweb.eval.x86_64.sympy_s_sympy-19487:latest
sweb.eval.x86_64.sympy_s_sympy-20049:latest
sweb.eval.x86_64.sympy_s_sympy-20154:latest
sweb.eval.x86_64.sympy_s_sympy-20212:latest
sweb.eval.x86_64.sympy_s_sympy-20322:latest
sweb.eval.x86_64.sympy_s_sympy-20442:latest
sweb.eval.x86_64.sympy_s_sympy-20590:latest
sweb.eval.x86_64.sympy_s_sympy-20639:latest
sweb.eval.x86_64.sympy_s_sympy-21055:latest
sweb.eval.x86_64.sympy_s_sympy-21171:latest
sweb.eval.x86_64.sympy_s_sympy-21379:latest
sweb.eval.x86_64.sympy_s_sympy-21612:latest
sweb.eval.x86_64.sympy_s_sympy-21614:latest
sweb.eval.x86_64.sympy_s_sympy-21627:latest
sweb.eval.x86_64.sympy_s_sympy-21847:latest
sweb.eval.x86_64.sympy_s_sympy-22005:latest
sweb.eval.x86_64.sympy_s_sympy-22714:latest
sweb.eval.x86_64.sympy_s_sympy-22840:latest
sweb.eval.x86_64.sympy_s_sympy-23117:latest
sweb.eval.x86_64.sympy_s_sympy-23191:latest
sweb.eval.x86_64.sympy_s_sympy-23262:latest
sweb.eval.x86_64.sympy_s_sympy-24066:latest
sweb.eval.x86_64.sympy_s_sympy-24102:latest
sweb.eval.x86_64.sympy_s_sympy-24152:latest
sweb.eval.x86_64.sympy_s_sympy-24213:latest
sweb.eval.x86_64.sympy_s_sympy-24909:latest

View File

@@ -0,0 +1,500 @@
swebench/sweb.eval.x86_64.astropy_1776_astropy-12907:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13033:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13236:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13398:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13453:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13579:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13977:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14096:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14182:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14309:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14365:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14369:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14508:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14539:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14598:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14995:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7166:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7336:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7606:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7671:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-8707:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-8872:latest
swebench/sweb.eval.x86_64.django_1776_django-10097:latest
swebench/sweb.eval.x86_64.django_1776_django-10554:latest
swebench/sweb.eval.x86_64.django_1776_django-10880:latest
swebench/sweb.eval.x86_64.django_1776_django-10914:latest
swebench/sweb.eval.x86_64.django_1776_django-10973:latest
swebench/sweb.eval.x86_64.django_1776_django-10999:latest
swebench/sweb.eval.x86_64.django_1776_django-11066:latest
swebench/sweb.eval.x86_64.django_1776_django-11087:latest
swebench/sweb.eval.x86_64.django_1776_django-11095:latest
swebench/sweb.eval.x86_64.django_1776_django-11099:latest
swebench/sweb.eval.x86_64.django_1776_django-11119:latest
swebench/sweb.eval.x86_64.django_1776_django-11133:latest
swebench/sweb.eval.x86_64.django_1776_django-11138:latest
swebench/sweb.eval.x86_64.django_1776_django-11141:latest
swebench/sweb.eval.x86_64.django_1776_django-11149:latest
swebench/sweb.eval.x86_64.django_1776_django-11163:latest
swebench/sweb.eval.x86_64.django_1776_django-11179:latest
swebench/sweb.eval.x86_64.django_1776_django-11206:latest
swebench/sweb.eval.x86_64.django_1776_django-11211:latest
swebench/sweb.eval.x86_64.django_1776_django-11239:latest
swebench/sweb.eval.x86_64.django_1776_django-11265:latest
swebench/sweb.eval.x86_64.django_1776_django-11276:latest
swebench/sweb.eval.x86_64.django_1776_django-11292:latest
swebench/sweb.eval.x86_64.django_1776_django-11299:latest
swebench/sweb.eval.x86_64.django_1776_django-11333:latest
swebench/sweb.eval.x86_64.django_1776_django-11400:latest
swebench/sweb.eval.x86_64.django_1776_django-11433:latest
swebench/sweb.eval.x86_64.django_1776_django-11451:latest
swebench/sweb.eval.x86_64.django_1776_django-11477:latest
swebench/sweb.eval.x86_64.django_1776_django-11490:latest
swebench/sweb.eval.x86_64.django_1776_django-11532:latest
swebench/sweb.eval.x86_64.django_1776_django-11551:latest
swebench/sweb.eval.x86_64.django_1776_django-11555:latest
swebench/sweb.eval.x86_64.django_1776_django-11603:latest
swebench/sweb.eval.x86_64.django_1776_django-11728:latest
swebench/sweb.eval.x86_64.django_1776_django-11734:latest
swebench/sweb.eval.x86_64.django_1776_django-11740:latest
swebench/sweb.eval.x86_64.django_1776_django-11749:latest
swebench/sweb.eval.x86_64.django_1776_django-11790:latest
swebench/sweb.eval.x86_64.django_1776_django-11815:latest
swebench/sweb.eval.x86_64.django_1776_django-11820:latest
swebench/sweb.eval.x86_64.django_1776_django-11848:latest
swebench/sweb.eval.x86_64.django_1776_django-11880:latest
swebench/sweb.eval.x86_64.django_1776_django-11885:latest
swebench/sweb.eval.x86_64.django_1776_django-11951:latest
swebench/sweb.eval.x86_64.django_1776_django-11964:latest
swebench/sweb.eval.x86_64.django_1776_django-11999:latest
swebench/sweb.eval.x86_64.django_1776_django-12039:latest
swebench/sweb.eval.x86_64.django_1776_django-12050:latest
swebench/sweb.eval.x86_64.django_1776_django-12125:latest
swebench/sweb.eval.x86_64.django_1776_django-12143:latest
swebench/sweb.eval.x86_64.django_1776_django-12155:latest
swebench/sweb.eval.x86_64.django_1776_django-12193:latest
swebench/sweb.eval.x86_64.django_1776_django-12209:latest
swebench/sweb.eval.x86_64.django_1776_django-12262:latest
swebench/sweb.eval.x86_64.django_1776_django-12273:latest
swebench/sweb.eval.x86_64.django_1776_django-12276:latest
swebench/sweb.eval.x86_64.django_1776_django-12304:latest
swebench/sweb.eval.x86_64.django_1776_django-12308:latest
swebench/sweb.eval.x86_64.django_1776_django-12325:latest
swebench/sweb.eval.x86_64.django_1776_django-12406:latest
swebench/sweb.eval.x86_64.django_1776_django-12419:latest
swebench/sweb.eval.x86_64.django_1776_django-12663:latest
swebench/sweb.eval.x86_64.django_1776_django-12708:latest
swebench/sweb.eval.x86_64.django_1776_django-12713:latest
swebench/sweb.eval.x86_64.django_1776_django-12741:latest
swebench/sweb.eval.x86_64.django_1776_django-12754:latest
swebench/sweb.eval.x86_64.django_1776_django-12774:latest
swebench/sweb.eval.x86_64.django_1776_django-12858:latest
swebench/sweb.eval.x86_64.django_1776_django-12965:latest
swebench/sweb.eval.x86_64.django_1776_django-13012:latest
swebench/sweb.eval.x86_64.django_1776_django-13023:latest
swebench/sweb.eval.x86_64.django_1776_django-13028:latest
swebench/sweb.eval.x86_64.django_1776_django-13033:latest
swebench/sweb.eval.x86_64.django_1776_django-13089:latest
swebench/sweb.eval.x86_64.django_1776_django-13109:latest
swebench/sweb.eval.x86_64.django_1776_django-13112:latest
swebench/sweb.eval.x86_64.django_1776_django-13121:latest
swebench/sweb.eval.x86_64.django_1776_django-13128:latest
swebench/sweb.eval.x86_64.django_1776_django-13158:latest
swebench/sweb.eval.x86_64.django_1776_django-13195:latest
swebench/sweb.eval.x86_64.django_1776_django-13212:latest
swebench/sweb.eval.x86_64.django_1776_django-13279:latest
swebench/sweb.eval.x86_64.django_1776_django-13297:latest
swebench/sweb.eval.x86_64.django_1776_django-13315:latest
swebench/sweb.eval.x86_64.django_1776_django-13343:latest
swebench/sweb.eval.x86_64.django_1776_django-13344:latest
swebench/sweb.eval.x86_64.django_1776_django-13346:latest
swebench/sweb.eval.x86_64.django_1776_django-13363:latest
swebench/sweb.eval.x86_64.django_1776_django-13401:latest
swebench/sweb.eval.x86_64.django_1776_django-13406:latest
swebench/sweb.eval.x86_64.django_1776_django-13410:latest
swebench/sweb.eval.x86_64.django_1776_django-13417:latest
swebench/sweb.eval.x86_64.django_1776_django-13449:latest
swebench/sweb.eval.x86_64.django_1776_django-13512:latest
swebench/sweb.eval.x86_64.django_1776_django-13513:latest
swebench/sweb.eval.x86_64.django_1776_django-13516:latest
swebench/sweb.eval.x86_64.django_1776_django-13551:latest
swebench/sweb.eval.x86_64.django_1776_django-13568:latest
swebench/sweb.eval.x86_64.django_1776_django-13569:latest
swebench/sweb.eval.x86_64.django_1776_django-13590:latest
swebench/sweb.eval.x86_64.django_1776_django-13658:latest
swebench/sweb.eval.x86_64.django_1776_django-13670:latest
swebench/sweb.eval.x86_64.django_1776_django-13741:latest
swebench/sweb.eval.x86_64.django_1776_django-13786:latest
swebench/sweb.eval.x86_64.django_1776_django-13794:latest
swebench/sweb.eval.x86_64.django_1776_django-13807:latest
swebench/sweb.eval.x86_64.django_1776_django-13809:latest
swebench/sweb.eval.x86_64.django_1776_django-13810:latest
swebench/sweb.eval.x86_64.django_1776_django-13820:latest
swebench/sweb.eval.x86_64.django_1776_django-13821:latest
swebench/sweb.eval.x86_64.django_1776_django-13837:latest
swebench/sweb.eval.x86_64.django_1776_django-13925:latest
swebench/sweb.eval.x86_64.django_1776_django-13933:latest
swebench/sweb.eval.x86_64.django_1776_django-13964:latest
swebench/sweb.eval.x86_64.django_1776_django-14007:latest
swebench/sweb.eval.x86_64.django_1776_django-14011:latest
swebench/sweb.eval.x86_64.django_1776_django-14017:latest
swebench/sweb.eval.x86_64.django_1776_django-14034:latest
swebench/sweb.eval.x86_64.django_1776_django-14053:latest
swebench/sweb.eval.x86_64.django_1776_django-14089:latest
swebench/sweb.eval.x86_64.django_1776_django-14122:latest
swebench/sweb.eval.x86_64.django_1776_django-14140:latest
swebench/sweb.eval.x86_64.django_1776_django-14155:latest
swebench/sweb.eval.x86_64.django_1776_django-14170:latest
swebench/sweb.eval.x86_64.django_1776_django-14238:latest
swebench/sweb.eval.x86_64.django_1776_django-14311:latest
swebench/sweb.eval.x86_64.django_1776_django-14315:latest
swebench/sweb.eval.x86_64.django_1776_django-14349:latest
swebench/sweb.eval.x86_64.django_1776_django-14351:latest
swebench/sweb.eval.x86_64.django_1776_django-14373:latest
swebench/sweb.eval.x86_64.django_1776_django-14376:latest
swebench/sweb.eval.x86_64.django_1776_django-14404:latest
swebench/sweb.eval.x86_64.django_1776_django-14434:latest
swebench/sweb.eval.x86_64.django_1776_django-14493:latest
swebench/sweb.eval.x86_64.django_1776_django-14500:latest
swebench/sweb.eval.x86_64.django_1776_django-14534:latest
swebench/sweb.eval.x86_64.django_1776_django-14539:latest
swebench/sweb.eval.x86_64.django_1776_django-14559:latest
swebench/sweb.eval.x86_64.django_1776_django-14580:latest
swebench/sweb.eval.x86_64.django_1776_django-14608:latest
swebench/sweb.eval.x86_64.django_1776_django-14631:latest
swebench/sweb.eval.x86_64.django_1776_django-14672:latest
swebench/sweb.eval.x86_64.django_1776_django-14725:latest
swebench/sweb.eval.x86_64.django_1776_django-14752:latest
swebench/sweb.eval.x86_64.django_1776_django-14765:latest
swebench/sweb.eval.x86_64.django_1776_django-14771:latest
swebench/sweb.eval.x86_64.django_1776_django-14787:latest
swebench/sweb.eval.x86_64.django_1776_django-14792:latest
swebench/sweb.eval.x86_64.django_1776_django-14855:latest
swebench/sweb.eval.x86_64.django_1776_django-14915:latest
swebench/sweb.eval.x86_64.django_1776_django-14999:latest
swebench/sweb.eval.x86_64.django_1776_django-15022:latest
swebench/sweb.eval.x86_64.django_1776_django-15037:latest
swebench/sweb.eval.x86_64.django_1776_django-15098:latest
swebench/sweb.eval.x86_64.django_1776_django-15103:latest
swebench/sweb.eval.x86_64.django_1776_django-15104:latest
swebench/sweb.eval.x86_64.django_1776_django-15127:latest
swebench/sweb.eval.x86_64.django_1776_django-15128:latest
swebench/sweb.eval.x86_64.django_1776_django-15161:latest
swebench/sweb.eval.x86_64.django_1776_django-15252:latest
swebench/sweb.eval.x86_64.django_1776_django-15268:latest
swebench/sweb.eval.x86_64.django_1776_django-15277:latest
swebench/sweb.eval.x86_64.django_1776_django-15278:latest
swebench/sweb.eval.x86_64.django_1776_django-15280:latest
swebench/sweb.eval.x86_64.django_1776_django-15315:latest
swebench/sweb.eval.x86_64.django_1776_django-15368:latest
swebench/sweb.eval.x86_64.django_1776_django-15375:latest
swebench/sweb.eval.x86_64.django_1776_django-15380:latest
swebench/sweb.eval.x86_64.django_1776_django-15382:latest
swebench/sweb.eval.x86_64.django_1776_django-15467:latest
swebench/sweb.eval.x86_64.django_1776_django-15499:latest
swebench/sweb.eval.x86_64.django_1776_django-15503:latest
swebench/sweb.eval.x86_64.django_1776_django-15525:latest
swebench/sweb.eval.x86_64.django_1776_django-15554:latest
swebench/sweb.eval.x86_64.django_1776_django-15561:latest
swebench/sweb.eval.x86_64.django_1776_django-15563:latest
swebench/sweb.eval.x86_64.django_1776_django-15569:latest
swebench/sweb.eval.x86_64.django_1776_django-15572:latest
swebench/sweb.eval.x86_64.django_1776_django-15629:latest
swebench/sweb.eval.x86_64.django_1776_django-15695:latest
swebench/sweb.eval.x86_64.django_1776_django-15731:latest
swebench/sweb.eval.x86_64.django_1776_django-15732:latest
swebench/sweb.eval.x86_64.django_1776_django-15741:latest
swebench/sweb.eval.x86_64.django_1776_django-15814:latest
swebench/sweb.eval.x86_64.django_1776_django-15851:latest
swebench/sweb.eval.x86_64.django_1776_django-15863:latest
swebench/sweb.eval.x86_64.django_1776_django-15916:latest
swebench/sweb.eval.x86_64.django_1776_django-15930:latest
swebench/sweb.eval.x86_64.django_1776_django-15957:latest
swebench/sweb.eval.x86_64.django_1776_django-15973:latest
swebench/sweb.eval.x86_64.django_1776_django-15987:latest
swebench/sweb.eval.x86_64.django_1776_django-16032:latest
swebench/sweb.eval.x86_64.django_1776_django-16082:latest
swebench/sweb.eval.x86_64.django_1776_django-16100:latest
swebench/sweb.eval.x86_64.django_1776_django-16116:latest
swebench/sweb.eval.x86_64.django_1776_django-16136:latest
swebench/sweb.eval.x86_64.django_1776_django-16139:latest
swebench/sweb.eval.x86_64.django_1776_django-16145:latest
swebench/sweb.eval.x86_64.django_1776_django-16255:latest
swebench/sweb.eval.x86_64.django_1776_django-16256:latest
swebench/sweb.eval.x86_64.django_1776_django-16263:latest
swebench/sweb.eval.x86_64.django_1776_django-16315:latest
swebench/sweb.eval.x86_64.django_1776_django-16333:latest
swebench/sweb.eval.x86_64.django_1776_django-16429:latest
swebench/sweb.eval.x86_64.django_1776_django-16454:latest
swebench/sweb.eval.x86_64.django_1776_django-16485:latest
swebench/sweb.eval.x86_64.django_1776_django-16493:latest
swebench/sweb.eval.x86_64.django_1776_django-16502:latest
swebench/sweb.eval.x86_64.django_1776_django-16527:latest
swebench/sweb.eval.x86_64.django_1776_django-16560:latest
swebench/sweb.eval.x86_64.django_1776_django-16569:latest
swebench/sweb.eval.x86_64.django_1776_django-16595:latest
swebench/sweb.eval.x86_64.django_1776_django-16612:latest
swebench/sweb.eval.x86_64.django_1776_django-16631:latest
swebench/sweb.eval.x86_64.django_1776_django-16642:latest
swebench/sweb.eval.x86_64.django_1776_django-16661:latest
swebench/sweb.eval.x86_64.django_1776_django-16662:latest
swebench/sweb.eval.x86_64.django_1776_django-16667:latest
swebench/sweb.eval.x86_64.django_1776_django-16801:latest
swebench/sweb.eval.x86_64.django_1776_django-16819:latest
swebench/sweb.eval.x86_64.django_1776_django-16877:latest
swebench/sweb.eval.x86_64.django_1776_django-16899:latest
swebench/sweb.eval.x86_64.django_1776_django-16901:latest
swebench/sweb.eval.x86_64.django_1776_django-16938:latest
swebench/sweb.eval.x86_64.django_1776_django-16950:latest
swebench/sweb.eval.x86_64.django_1776_django-17029:latest
swebench/sweb.eval.x86_64.django_1776_django-17084:latest
swebench/sweb.eval.x86_64.django_1776_django-17087:latest
swebench/sweb.eval.x86_64.django_1776_django-7530:latest
swebench/sweb.eval.x86_64.django_1776_django-9296:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-13989:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-14623:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-20488:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-20676:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-20826:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-20859:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-21568:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22719:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22865:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22871:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23299:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23314:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23412:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23476:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24026:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24149:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24177:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24570:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24627:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24637:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24870:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24970:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25122:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25287:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25311:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25332:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25479:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25775:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25960:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26113:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26208:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26291:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26342:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26466:latest
swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3069:latest
swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3187:latest
swebench/sweb.eval.x86_64.pallets_1776_flask-5014:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1142:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1724:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1766:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1921:latest
swebench/sweb.eval.x86_64.psf_1776_requests-2317:latest
swebench/sweb.eval.x86_64.psf_1776_requests-2931:latest
swebench/sweb.eval.x86_64.psf_1776_requests-5414:latest
swebench/sweb.eval.x86_64.psf_1776_requests-6028:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-2905:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3095:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3151:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3305:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3677:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3993:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4075:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4094:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4356:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4629:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4687:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4695:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4966:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6461:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6599:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6721:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6744:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6938:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6992:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-7229:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-7233:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-7393:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-4551:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-4604:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-4661:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-4970:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-6386:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-6528:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-6903:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7080:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7277:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-8898:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-10051:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-10081:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-10356:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5262:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5631:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5787:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5809:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5840:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-6197:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-6202:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7205:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7236:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7324:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7432:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7490:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7521:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7571:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7982:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-8399:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10297:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10844:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10908:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-11310:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-11578:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-12585:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-12682:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-12973:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13124:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13135:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13142:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13328:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13439:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13496:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13779:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14053:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14087:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14141:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14496:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14629:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14710:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14894:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14983:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-15100:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25102:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25232:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25747:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25931:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25973:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-26194:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-26323:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-9288:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10323:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10435:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10449:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10466:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10614:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10673:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-11445:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-11510:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7440:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7454:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7462:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7590:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7748:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7757:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7889:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7910:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7985:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8035:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8056:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8120:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8265:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8269:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8459:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8475:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8548:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8551:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8593:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8595:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8621:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8638:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8721:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9229:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9230:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9258:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9281:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9320:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9367:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9461:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9591:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9602:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9658:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9673:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9698:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9711:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-11618:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12096:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12419:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12481:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12489:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13031:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13091:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13372:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13480:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13551:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13615:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13647:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13757:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13798:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13852:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13877:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13878:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13974:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14248:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14531:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14711:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14976:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15017:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15345:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15349:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15599:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15809:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15875:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15976:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16450:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16597:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16766:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16792:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16886:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17139:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17318:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17630:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17655:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18199:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18211:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18698:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18763:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19040:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19346:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19495:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19637:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19783:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19954:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20154:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20428:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20438:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20590:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20801:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20916:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21379:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21596:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21612:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21847:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21930:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22080:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22456:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22714:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22914:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23262:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23413:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23534:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23824:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23950:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24066:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24213:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24443:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24539:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24562:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24661:latest

View File

@@ -0,0 +1,50 @@
"""Get official docker image names for SWE-bench instances."""
import argparse
from datasets import load_dataset
# Command-line interface: which dataset/split to read and where to write the
# resulting list of image names.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench')
parser.add_argument('--split', type=str, default='test')
parser.add_argument('--output', type=str, default='swebench_images.txt')
args = parser.parse_args()

# Datasets this script accepts for --dataset.
SUPPORTED_DATASET = {
    'princeton-nlp/SWE-bench_Multimodal',
    'princeton-nlp/SWE-bench',
    'princeton-nlp/SWE-bench_Lite',
    'princeton-nlp/SWE-bench_Verified',
}

# Validate explicitly rather than with `assert`: assertions are stripped when
# Python runs with -O, which would let an unsupported dataset slip through.
if args.dataset not in SUPPORTED_DATASET:
    parser.error(f'Dataset {args.dataset} not supported')
def swebench_instance_id_to_docker_image_name(instance_id: str) -> str:
    """Return the SWE-bench evaluation image name for *instance_id*.

    The instance id is expected to look like ``<repo>__<name>``; the ``__``
    separator is replaced by ``_1776_`` in the image name.

    Example: ``django__django-11333`` ->
    ``swebench/sweb.eval.x86_64.django_1776_django-11333:latest``

    Raises:
        ValueError: if *instance_id* does not contain exactly one ``__``.
    """
    repo, name = instance_id.split('__')
    # Docker repository names must be lowercase, so normalise the result.
    return f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
def swebench_multimodal_instance_id_to_docker_image_name(instance_id: str) -> str:
    """Return the SWE-bench Multimodal evaluation image name for *instance_id*.

    The instance id is expected to look like ``<repo>__<name>``; the ``__``
    separator is replaced by ``_1776_`` in the image name.

    Example: ``openlayers__openlayers-12172`` ->
    ``swebench/sweb.mm.eval.x86_64.openlayers_1776_openlayers-12172:latest``

    Raises:
        ValueError: if *instance_id* does not contain exactly one ``__``.
    """
    repo, name = instance_id.split('__')
    # Docker repository names must be lowercase, so an instance id containing
    # capitals would otherwise produce an invalid reference.
    return f'swebench/sweb.mm.eval.x86_64.{repo}_1776_{name}:latest'.lower()
# Load the requested split and collect its instance ids.
dataset = load_dataset(args.dataset, split=args.split)
instance_ids = dataset['instance_id']
print(f'Loading {len(instance_ids)} instances from {args.dataset} split {args.split}')

# The naming scheme depends only on the dataset, not on the individual
# instance, so choose the converter once instead of on every iteration.
if args.dataset in [
    'princeton-nlp/SWE-bench',
    'princeton-nlp/SWE-bench_Lite',
    'princeton-nlp/SWE-bench_Verified',
]:
    to_image_name = swebench_instance_id_to_docker_image_name
else:
    to_image_name = swebench_multimodal_instance_id_to_docker_image_name

# Write one image name per line.
with open(args.output, 'w') as f:
    f.writelines(f'{to_image_name(instance_id)}\n' for instance_id in instance_ids)
print(f'Saved {len(instance_ids)} images to {args.output}')

View File

@@ -1,66 +1,36 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -e set -e
LEVEL=$1 SET=$1
# three levels: # check set is in ["full", "lite", "verified"]
# - base, keyword "sweb.base" if [ "$SET" != "full" ] && [ "$SET" != "lite" ] && [ "$SET" != "verified" ]; then
# - env, keyword "sweb.env" echo "Error: argument 1 must be one of: full, lite, verified"
# - instance, keyword "sweb.eval"
SET=$2
if [ -z "$LEVEL" ]; then
echo "Usage: $0 <cache_level> <set>"
echo "cache_level: base, env, or instance"
echo "set: lite, full"
exit 1 exit 1
fi fi
if [ -z "$SET" ]; then input_file=evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-${SET}-instance-images.txt
echo "Usage: $0 <cache_level> <set>" echo "Downloading images based on ${input_file}"
echo "cache_level: base, env, or instance" # Check if the file exists
echo "set: lite, full, default is lite" if [ ! -f "$input_file" ]; then
SET="lite" echo "Error: File '$input_file' not found"
exit 1
fi fi
# Check if namespace is provided via argument $3, otherwise default to 'xingyaoww' # Get total number of images
NAMESPACE=${3:-xingyaoww} total_images=$(wc -l < "${input_file}")
counter=0
echo "Using namespace: $NAMESPACE" echo "Starting to pull ${total_images} images"
if [ "$SET" == "lite" ]; then # Read the file line by line and pull each image
IMAGE_FILE="$(dirname "$0")/all-swebench-lite-instance-images.txt" while IFS= read -r image; do
else # Skip empty lines or comments
IMAGE_FILE="$(dirname "$0")/all-swebench-full-instance-images.txt" if [ -n "$image" ] && [[ ! "$image" =~ ^[[:space:]]*# ]]; then
fi counter=$((counter + 1))
echo "[${counter}/${total_images}] Pulling ${image}"
docker pull "${image}"
sleep 2
fi
done < "${input_file}"
# Define a pattern based on the level echo "Finished pulling all images"
case $LEVEL in
base)
PATTERN="sweb.base"
;;
env)
PATTERN="sweb.base\|sweb.env"
;;
instance)
PATTERN="sweb.base\|sweb.env\|sweb.eval"
;;
*)
echo "Invalid cache level: $LEVEL"
echo "Valid levels are: base, env, instance"
exit 1
;;
esac
echo "Pulling docker images for [$LEVEL] level"
echo "Pattern: $PATTERN"
echo "Image file: $IMAGE_FILE"
# Read each line from the file, filter by pattern, and pull the docker image
grep "$PATTERN" "$IMAGE_FILE" | while IFS= read -r image; do
echo "Pulling $NAMESPACE/$image into $image"
docker pull $NAMESPACE/$image
# replace _s_ to __ in the image name
renamed_image=$(echo "$image" | sed 's/_s_/__/g')
docker tag $NAMESPACE/$image $renamed_image
done

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import os import os
import subprocess
import pandas as pd import pandas as pd
from termcolor import colored from termcolor import colored
@@ -35,6 +36,23 @@ if args.only_x_instances:
f'After removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances' f'After removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
) )
# Add summarization step for each input file
def summarize_file(file_path):
    """Run ``summarize_outputs.py`` on *file_path* and frame its output.

    The summarizer script is looked up next to this file and executed as a
    child process; a header and ``=`` rules are printed around it.

    Raises:
        subprocess.CalledProcessError: if the summarizer exits non-zero.
    """
    import sys

    script_dir = os.path.dirname(os.path.abspath(__file__))
    summarize_script = os.path.join(script_dir, 'summarize_outputs.py')

    # Flush before spawning the child so the header stays ahead of the child's
    # output when stdout is redirected to a file or pipe.
    print(f'\nSummary for {file_path}:')
    print('=' * 80, flush=True)
    # Use the interpreter running this script rather than whatever `python`
    # happens to be on PATH (it may be missing or lack the required packages).
    subprocess.run([sys.executable, summarize_script, file_path], check=True)
    print('=' * 80)
# Generate summaries
summarize_file(args.input_file_1)
summarize_file(args.input_file_2)
# Get the intersection of the instance_ids # Get the intersection of the instance_ids
df = pd.merge(df1, df2, on='instance_id', how='inner') df = pd.merge(df1, df2, on='instance_id', how='inner')

View File

@@ -248,6 +248,22 @@ def write_row_to_md_file(row, instance_id_to_test_result):
completions = load_completions(instance_id) completions = load_completions(instance_id)
# report file
global output_dir
report_file = os.path.join(output_dir, 'eval_outputs', instance_id, 'report.json')
if os.path.exists(report_file):
with open(report_file, 'r') as f:
report = json.load(f)
else:
report = None
test_output_file = os.path.join(
output_dir, 'eval_outputs', instance_id, 'test_output.txt'
)
if test_output is None and os.path.exists(test_output_file):
with open(test_output_file, 'r') as f:
test_output = f.read()
with open(filepath, 'w') as f: with open(filepath, 'w') as f:
f.write(f'# {instance_id} (resolved: {resolved})\n') f.write(f'# {instance_id} (resolved: {resolved})\n')
@@ -269,8 +285,14 @@ def write_row_to_md_file(row, instance_id_to_test_result):
f.write('## Model Patch\n') f.write('## Model Patch\n')
f.write(f'{process_git_patch(model_patch)}\n') f.write(f'{process_git_patch(model_patch)}\n')
if report is not None:
f.write('## Report\n')
f.write(json.dumps(report, indent=2))
f.write('\n')
f.write('## Test Output\n') f.write('## Test Output\n')
f.write(str(test_output)) f.write(str(test_output))
f.write('\n')
instance_id_to_test_result = {} instance_id_to_test_result = {}

View File

@@ -44,7 +44,6 @@ if os.path.exists(swebench_official_report_json):
f"- resolved instances: {report['resolved_instances']}\n" f"- resolved instances: {report['resolved_instances']}\n"
f"- unresolved instances: {report['unresolved_instances']}\n" f"- unresolved instances: {report['unresolved_instances']}\n"
f"- error instances: {report['error_instances']}\n" f"- error instances: {report['error_instances']}\n"
f"- unstopped instances: {report['unstopped_instances']}\n"
) )
output_md += '\n## Resolved Instances\n' output_md += '\n## Resolved Instances\n'

View File

@@ -247,11 +247,21 @@ def prepare_dataset(
f'Starting evaluation with skipping first {skip_num} instances ({len(dataset)} instances to run).' f'Starting evaluation with skipping first {skip_num} instances ({len(dataset)} instances to run).'
) )
if eval_n_limit and eval_n_limit > 0: if eval_n_limit and eval_n_limit > 0:
dataset = dataset.head(eval_n_limit) # Use fixed random seed 42 for sampling without replacement
logger.info(f'Limiting evaluation to {eval_n_limit} instances.') dataset = dataset.sample(
min(eval_n_limit, len(dataset)), random_state=42, replace=False
)
logger.info(
f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
)
elif eval_n_limit and eval_n_limit > 0: elif eval_n_limit and eval_n_limit > 0:
dataset = dataset.head(eval_n_limit) # Use fixed random seed 42 for sampling without replacement
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.') dataset = dataset.sample(
min(eval_n_limit, len(dataset)), random_state=42, replace=False
)
logger.info(
f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
)
new_dataset = [ new_dataset = [
instance instance

21
poetry.lock generated
View File

@@ -9106,13 +9106,15 @@ files = [
[[package]] [[package]]
name = "swebench" name = "swebench"
version = "2.0.13" version = "3.0.8"
description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering" description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["evaluation"] groups = ["evaluation"]
files = [] files = [
develop = false {file = "swebench-3.0.8-py3-none-any.whl", hash = "sha256:daea564215dc77fc27998405a68e7b40880d25ed408813fe0ccd890bcc249a02"},
{file = "swebench-3.0.8.tar.gz", hash = "sha256:f86f8412690c808592b3accb20c018f9cf480dbafa21525e065a138dd06b6e1f"},
]
[package.dependencies] [package.dependencies]
beautifulsoup4 = "*" beautifulsoup4 = "*"
@@ -9121,21 +9123,18 @@ datasets = "*"
docker = "*" docker = "*"
ghapi = "*" ghapi = "*"
GitPython = "*" GitPython = "*"
modal = "*"
pre-commit = "*" pre-commit = "*"
python-dotenv = "*" python-dotenv = "*"
requests = "*" requests = "*"
rich = "*" rich = "*"
tenacity = "*"
tqdm = "*" tqdm = "*"
unidiff = "*" unidiff = "*"
[package.extras] [package.extras]
inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"] inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tiktoken", "torch", "transformers", "triton"]
test = ["pytest", "pytest-cov"]
[package.source]
type = "git"
url = "https://github.com/All-Hands-AI/SWE-bench.git"
reference = "HEAD"
resolved_reference = "c807c112edc3dcb4fdf5ddac63b34706912d5cdb"
[[package]] [[package]]
name = "sympy" name = "sympy"
@@ -10853,4 +10852,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = "^3.12" python-versions = "^3.12"
content-hash = "6162482b9821778fed90d6cf4e252f90bf4dd70a44f44295837297d24f440138" content-hash = "0aa5dc28564265aa19b0c90e6f65cd2b086a373ecdaa5c521542aa19d3c84ecf"

View File

@@ -144,7 +144,7 @@ streamlit = "*"
whatthepatch = "*" whatthepatch = "*"
retry = "*" retry = "*"
evaluate = "*" evaluate = "*"
swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" } swebench = "^3.0.8"
commit0 = "*" commit0 = "*"
func_timeout = "*" func_timeout = "*"
sympy = "*" sympy = "*"