[eval] SWE-Gym Integration (#6651)

Co-authored-by: Robert Brennan <accounts@rbren.io> Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: Graham Neubig <neubig@gmail.com>
2026-01-09 14:57:59 -05:00 · 2025-03-05 15:15:02 -05:00
parent bbf40c6576
commit 9f720a9d69
10 changed files with 638 additions and 20 deletions
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -8938,7 +8938,7 @@ files = [

 [package.dependencies]
 greenlet = [
-    {version = "!=0.4.17", optional = true, markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") or extra == \"asyncio\""},
+    {version = "!=0.4.17", markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"},
    {version = "!=0.4.17", optional = true, markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") or extra == \"asyncio\""},
 ]
 typing-extensions = ">=4.6.0"
@@ -9109,14 +9109,14 @@ files = [

 [[package]]
 name = "swebench"
-version = "3.0.13"
+version = "3.0.15"
 description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering"
 optional = false
 python-versions = ">=3.8"
 groups = ["evaluation"]
 files = [
-    {file = "swebench-3.0.13-py3-none-any.whl", hash = "sha256:0949e0a7269fcebb287dd951d14c049bd8189c7740fc4878354dbec756531c0f"},
-    {file = "swebench-3.0.13.tar.gz", hash = "sha256:d1cce406d0674cb1f3ca7da90089644d1ded3649c98f239a5a7ef4829d2f7c58"},
+    {file = "swebench-3.0.15-py3-none-any.whl", hash = "sha256:dd694356f9c155a55d3d2e113fe58446f7385eea0574230af5e2504426f8b85b"},
+    {file = "swebench-3.0.15.tar.gz", hash = "sha256:24e734fbcce34082665a25719075e6899382b7135103dd8c6cc09a6e23789101"},
 ]

 [package.dependencies]
@@ -9139,6 +9139,39 @@ unidiff = "*"
 inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tiktoken", "torch", "transformers", "triton"]
 test = ["pytest", "pytest-cov"]

+[[package]]
+name = "swegym"
+version = "2.0.13"
+description = "Fork of SWE-bench package - a benchmark for evaluating LMs on software engineering"
+optional = false
+python-versions = ">=3.8"
+groups = ["evaluation"]
+files = []
+develop = false
+
+[package.dependencies]
+beautifulsoup4 = "*"
+chardet = "*"
+datasets = "*"
+docker = "*"
+ghapi = "*"
+GitPython = "*"
+pre-commit = "*"
+python-dotenv = "*"
+requests = "*"
+rich = "*"
+tqdm = "*"
+unidiff = "*"
+
+[package.extras]
+inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"]
+
+[package.source]
+type = "git"
+url = "https://github.com/SWE-Gym/SWE-Bench-Package.git"
+reference = "HEAD"
+resolved_reference = "16dd480cce9b27bf111a362d280881c6def5d2a7"
+
 [[package]]
 name = "sympy"
 version = "1.13.1"
@@ -10855,4 +10888,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.12"
-content-hash = "83da0b681253a79417c9842862cdd102c1ab6e8770d9dd9e0c42bc7994be2cd0"
+content-hash = "c3f32c54606e5f313d9a909625f77cc3d575bf951e986633bcecd94520f36450"