"""
|
|
Implements evaluation of agents on ML-Bench, a benchmark for assessing the effectiveness of
|
|
Large Language Models (LLMs) in leveraging existing functions in open-source libraries for
|
|
machine learning tasks. The benchmark is introduced in the paper "ML-Bench: Evaluating Large
|
|
Language Models for Code Generation in Repository-Level Machine Learning Tasks"
|
|
(https://arxiv.org/abs/2311.09835).
|
|
|
|
Please see https://ghcr.io/super-dainiu/ml_bench and https://huggingface.co/datasets/super-dainiu/ml-bench
|
|
for more details on the dataset and docker image used in this evaluation script.
|
|
|
|
TODOs:
|
|
- Support additional evaluation settings, such as providing raw README content or using a
|
|
retriever to extract relevant segments.
|
|
- Clean up the code and docker image used for evaluation.
|
|
"""
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor

from datasets import load_dataset
from tqdm import tqdm

from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.runtime.docker.ssh_box import DockerSSHBox


def cleanup():
    logger.info('Cleaning up child processes...')
    for process in mp.active_children():
        logger.info(f'Terminating child process: {process.name}')
        process.terminate()
        process.join()


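# Fake user responses keep the evaluation non-interactive: when the agent addresses
# the (simulated) user, the `fake_user_response_fn` registered for the agent class
# below is called and its return value is fed back as the user's message. This
# description of the mechanism is inferred from how main() is invoked further down.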
def codeact_user_response(state: State) -> str:
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have completed the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
    )
    if state.history:
        user_msgs = [
            action
            for action, _ in state.history
            if isinstance(action, MessageAction) and action.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # the agent has already been nudged twice (i.e. this is at least its
            # third attempt), so let it know that it is allowed to give up
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg


def monologue_user_response(state: State) -> str:
    raise NotImplementedError('MonologueAgent should never ask for user responses.')


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'MonologueAgent': monologue_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have completed the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
}

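# Mapping from an ML-Bench `github_id` to the name of the conda environment that is
# pre-built for that repository inside the evaluation docker image (see the image
# referenced in the module docstring).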
ID2CONDA = {
    1: 'dgl_DS',
    2: 'bert_DS',
    3: 'lavis_DS',
    4: 'if_DS',
    5: 'V2V_DS',
    6: 'esm_DS',
    7: 'OP_DS',
    8: 'TSL_DS',
    9: 'EAP_DS',
    10: 'PG_DS',
    11: 'PIM_DS',
    12: 'AD2_DS',
    13: 'L3_DS',
    14: 'MZ2_DS',
    15: 'GSA2_DS',
}


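# Evaluate a single ML-Bench instance end to end: set up an isolated workspace and
# sandbox, activate the task's conda environment, clone the task repository, let the
# agent write `run.sh`, then execute that script and score the outcome.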
def process_instance(
    instance, agent_class, metadata, eval_output_dir, reset_logger: bool = True
):
    # Remember the global workspace config so it can be restored when we are done.
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base
    sandbox = None
    try:
        workspace_mount_path = os.path.join(
            config.workspace_mount_path, '_eval_workspace'
        )
        # create a process-specific workspace dir
        # so that different agents don't interfere with each other.
        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

        # point the global config at the process-specific workspace
        config.workspace_base = workspace_mount_path
        config.workspace_mount_path = workspace_mount_path

        # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
        if reset_logger:
            # Set up a per-instance log file
            log_file = os.path.join(
                eval_output_dir,
                'logs',
                f"instance_{instance['id']}_pid_{os.getpid()}.log",
            )
            # Remove all existing handlers from logger
            for handler in logger.handlers[:]:
                logger.removeHandler(handler)
            # add back the console handler to print ONE line
            logger.addHandler(get_console_handler())
            logger.info(
                f"Starting evaluation for instance {instance['id']}.\nLOG: tail -f {log_file}"
            )
            # Remove all existing handlers again, then log to the file only
            for handler in logger.handlers[:]:
                logger.removeHandler(handler)
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            logger.addHandler(file_handler)

        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

        # Create a sandbox, using the instance ID plus PID as the session ID to avoid conflicts
        sandbox = DockerSSHBox(sid=str(instance['id']) + '_' + str(os.getpid()))

        # Set up the task environment
        sandbox.execute(f'conda activate {ID2CONDA[instance["github_id"]]}')

        # Clone the task repo into the sandbox
        repo_url = instance['github']
        repo_name = repo_url.split('/')[-1]
        sandbox.execute(f'git clone {repo_url} /workspace/{repo_name}')
        sandbox.execute(f'chmod -R 777 /workspace/{repo_name}')

        # Navigate to the task's code path; instance['path'] is expected to start
        # with './', hence the [2:] slice
        task_path = os.path.join('/workspace', repo_name, instance['path'][2:])
        sandbox.execute(f'cd {task_path}')

        # Prepare the task instruction
        instruction = (
            f'Please complete the Machine Learning task in the following repository: {repo_name}\n\n'
            f'The task is: {instance["task"]}\n\n'
            f'{instance["instruction"]}\n\n'
            'You should create a script named `run.sh` under the specified path in the repo to run the task.\n\n'
            f'You can find the task repo at: {task_path}\n\n'
            + (
                'Here is the prefix code for the task:\n'
                '```bash\n'
                f'{instance["prefix_code"]}\n'
                '```\n\n'
                if instance['prefix_code']
                else ''
            )
            + 'You should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).'
        )
        instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

        # Run the agent
        state: State = asyncio.run(
            main(
                instruction,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                    agent_class
                ),
                sandbox=sandbox,
            )
        )
        metrics = state.metrics.get() if state.metrics else {}

        # Evaluate the agent's script
        eval_script = os.path.join(task_path, 'run.sh')
        logger.info(f'Running evaluation script: {eval_script}')

        try:
            _, eval_script_content = sandbox.execute(f'cat {eval_script}')
        except Exception as e:
            logger.error(f'Error reading evaluation script: {e}')
            eval_script_content = ''

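        # Two timeouts apply here: the inner `timeout 120s` kills the task itself
        # after two minutes inside the sandbox, while the outer timeout=600 bounds
        # the sandbox.execute() call as a whole.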
        try:
            exit_code, eval_output = sandbox.execute(
                f'timeout 120s conda run -n {ID2CONDA[instance["github_id"]]} bash {eval_script}',
                timeout=600,
            )
        except Exception as e:
            logger.error(f'Error running evaluation script: {e}')
            exit_code = -1
            eval_output = ''

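        # Scoring convention: exit code 124 is what GNU `timeout` returns when the
        # 120s limit is hit, so a timed-out run still counts as a success below; a
        # run that failed with any other non-zero code only counts as a success if
        # it was cut short by a KeyboardInterrupt, mirroring ML-Bench's own scoring.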
        if exit_code != 0 and exit_code != 124:
            logger.warning(f'Evaluation script failed with exit code {exit_code}')
            logger.warning(f'Output: {eval_output}')
            metrics['success'] = int(
                'KeyboardInterrupt' in eval_output
            )  # super-dainiu: assume ``KeyboardInterrupt`` is a success as is done in ML-Bench
        else:
            logger.info(f'Evaluation script succeeded with exit code {exit_code}')
            logger.info(f'Output: {eval_output}')
            metrics['success'] = 1

        # Save the output
        output = {
            'instance_id': instance['id'],
            'repo': repo_url,
            'instruction': instruction,
            'metadata': metadata,
            'history': [
                (event_to_dict(action), event_to_dict(obs))
                for action, obs in state.history
            ],
            'eval_script': eval_script_content,
            'eval_exit_code': exit_code,
            'eval_output': eval_output,
            'metrics': metrics,
        }

    except Exception as e:
        logger.error(f'Error processing instance {instance["id"]}: {e}')
        raise
    finally:
        config.workspace_mount_path = old_workspace_mount_path
        config.workspace_base = old_workspace_base

        # Shut down the sandbox even if the instance failed, so containers
        # don't leak across instances
        if sandbox is not None:
            sandbox.close()

    return output


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '-s',
        '--eval-split',
        type=str,
        default='quarter',
        choices=['full', 'quarter'],
        help='data split to evaluate on, either full or quarter',
    )
    args, _ = parser.parse_known_args()

    data_split = args.eval_split
    agent_class = args.agent_cls
    num_workers = args.eval_num_workers

    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
    # for details of how to set `llm_config`
    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo
    ml_bench = load_dataset('super-dainiu/ml-bench', split=data_split).to_pandas()

    # LIMIT EVALUATION
    eval_n_limit = args.eval_n_limit
    if eval_n_limit:
        ml_bench = ml_bench.head(eval_n_limit)
        logger.info(f'Limiting evaluation to {eval_n_limit} instances.')

    # TEST METADATA
    model_name = config.llm.model.split('/')[-1]
    max_iterations = args.max_iterations
    eval_note = ''
    if args.eval_note is not None:
        eval_note += '_N_' + args.eval_note
    eval_output_dir = os.path.join(
        args.eval_output_dir,
        'ml_bench',
        agent_class,
        model_name + '_maxiter_' + str(max_iterations) + eval_note,
    )
    os.makedirs(eval_output_dir, exist_ok=True)
    os.makedirs(os.path.join(eval_output_dir, 'logs'), exist_ok=True)
    logger.info(f'Using evaluation output directory: {eval_output_dir}')

    metadata = {
        'agent_class': agent_class,
        'model_name': model_name,
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        # get the commit id of the current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
    }
    logger.info(f'Metadata: {metadata}')

    output_file = os.path.join(eval_output_dir, 'output.jsonl')
    logger.info(f'Evaluating on data split: {data_split}')
    logger.info(f'Using {num_workers} worker processes')
    logger.info(f'Writing evaluation output to {output_file}')

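    # Resume support: output.jsonl is opened in append mode below, so any instance
    # ids already recorded there are skipped on a re-run.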
    finished_instance_ids = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print(f'Error parsing line: {line}')
                    continue  # skip unparseable lines instead of reusing stale data
                finished_instance_ids.add(data['instance_id'])
        logger.warning(
            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
        )
    output_fp = open(output_file, 'a')

    logger.info(
        f'Evaluation started with Agent {agent_class}, model {model_name}, data split {data_split}.'
    )

    # Filter out finished instances
    new_instances = [
        instance
        for _, instance in ml_bench.iterrows()
        if instance['id'] not in finished_instance_ids
    ]
    logger.info(
        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(new_instances)}'
    )

    pbar = tqdm(total=len(new_instances))

    # This function tracks the progress AND writes the output to a JSONL file
    def update_progress(future):
        pbar.update(1)
        output = future.result()
        pbar.set_description(f'Instance {output["instance_id"]}')
        pbar.set_postfix_str(f'Metrics: {output["metrics"]}')
        logger.info(
            f'Finished evaluation for instance {output["instance_id"]}: {output["metrics"]}'
        )
        output_fp.write(json.dumps(output) + '\n')
        output_fp.flush()

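    # Note: with ProcessPoolExecutor, done-callbacks run in a thread of the parent
    # process, so update_progress can safely share pbar and output_fp.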
    # This sets up the multi-processing
    try:
        with ProcessPoolExecutor(num_workers) as executor:
            futures = []
            # Submit only the instances that have not been finished yet
            for instance in new_instances:
                future = executor.submit(
                    process_instance,
                    instance,
                    agent_class,
                    metadata,
                    eval_output_dir,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
                futures.append(future)

            # Propagate any exceptions raised inside the workers
            for future in futures:
                future.result()
    except KeyboardInterrupt:
        print('KeyboardInterrupt received. Cleaning up...')
        cleanup()

    output_fp.close()
    logger.info('Evaluation completed.')