[eval] SWE-Gym Integration (#6651)

Co-authored-by: Robert Brennan <accounts@rbren.io>
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
Xingyao Wang
2025-03-05 15:15:02 -05:00
committed by GitHub
parent bbf40c6576
commit 9f720a9d69
10 changed files with 638 additions and 20 deletions

45
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -8938,7 +8938,7 @@ files = [
[package.dependencies]
greenlet = [
{version = "!=0.4.17", optional = true, markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") or extra == \"asyncio\""},
{version = "!=0.4.17", markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"},
{version = "!=0.4.17", optional = true, markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") or extra == \"asyncio\""},
]
typing-extensions = ">=4.6.0"
@@ -9109,14 +9109,14 @@ files = [
[[package]]
name = "swebench"
version = "3.0.13"
version = "3.0.15"
description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering"
optional = false
python-versions = ">=3.8"
groups = ["evaluation"]
files = [
{file = "swebench-3.0.13-py3-none-any.whl", hash = "sha256:0949e0a7269fcebb287dd951d14c049bd8189c7740fc4878354dbec756531c0f"},
{file = "swebench-3.0.13.tar.gz", hash = "sha256:d1cce406d0674cb1f3ca7da90089644d1ded3649c98f239a5a7ef4829d2f7c58"},
{file = "swebench-3.0.15-py3-none-any.whl", hash = "sha256:dd694356f9c155a55d3d2e113fe58446f7385eea0574230af5e2504426f8b85b"},
{file = "swebench-3.0.15.tar.gz", hash = "sha256:24e734fbcce34082665a25719075e6899382b7135103dd8c6cc09a6e23789101"},
]
[package.dependencies]
@@ -9139,6 +9139,39 @@ unidiff = "*"
inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tiktoken", "torch", "transformers", "triton"]
test = ["pytest", "pytest-cov"]
[[package]]
name = "swegym"
version = "2.0.13"
description = "Fork of SWE-bench package - a benchmark for evaluating LMs on software engineering"
optional = false
python-versions = ">=3.8"
groups = ["evaluation"]
files = []
develop = false
[package.dependencies]
beautifulsoup4 = "*"
chardet = "*"
datasets = "*"
docker = "*"
ghapi = "*"
GitPython = "*"
pre-commit = "*"
python-dotenv = "*"
requests = "*"
rich = "*"
tqdm = "*"
unidiff = "*"
[package.extras]
inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"]
[package.source]
type = "git"
url = "https://github.com/SWE-Gym/SWE-Bench-Package.git"
reference = "HEAD"
resolved_reference = "16dd480cce9b27bf111a362d280881c6def5d2a7"
[[package]]
name = "sympy"
version = "1.13.1"
@@ -10855,4 +10888,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12"
content-hash = "83da0b681253a79417c9842862cdd102c1ab6e8770d9dd9e0c42bc7994be2cd0"
content-hash = "c3f32c54606e5f313d9a909625f77cc3d575bf951e986633bcecd94520f36450"