diff --git a/evaluation/README.md b/evaluation/README.md
index ffe07d577d..496b313ffb 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -13,6 +13,7 @@ all the preprocessing/evaluation/analysis scripts.
## Supported Benchmarks
- SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
+- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- GAIA: [`evaluation/gaia`](./gaia)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
diff --git a/evaluation/ml_bench/README.md b/evaluation/ml_bench/README.md
new file mode 100644
index 0000000000..78f227d9ec
--- /dev/null
+++ b/evaluation/ml_bench/README.md
@@ -0,0 +1,126 @@
+# ML-Bench Evaluation with OpenDevin
+
+This project implements the evaluation of agents on the [ML-Bench](https://arxiv.org/abs/2311.09835) dataset using OpenDevin. [ML-Bench](https://arxiv.org/abs/2311.09835) is a comprehensive benchmark designed to assess the effectiveness of Large Language Models (LLMs) in leveraging existing functions in open-source libraries for machine learning tasks. The benchmark consists of 10,040 samples spanning 130 tasks over 14 notable machine learning GitHub repositories.
+
+## Task Overview
+
+In the ML-Bench task, the language model is given a GitHub repository and has access to all files within it. Upon receiving a user instruction with a set of specific parameters, the agent must write code that invokes models or functions from the repository. The generated code must align with the user's instruction, particularly the specified parameters, and must be executable.
+
+The task introduces new challenges for LLMs, such as comprehending long and language-code interleaved documents, understanding complex cross-file code structures, and effectively navigating the codebase to locate relevant information. ML-Bench serves as a critical tool for assessing the efficiency and adaptability of various methods in real-world scenarios.
+
+For more details on the ML-Bench task and dataset, please refer to the paper: [ML-Bench: Evaluating Large Language Models for Code Generation in Repository-Level Machine Learning Tasks](https://arxiv.org/abs/2311.09835).
+
+## Setup Environment
+
+Please follow the [OpenDevin setup guide](https://github.com/OpenDevin/OpenDevin/blob/main/docs/setup.md) to set up the local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file at the root of the workspace if it does not already exist.
+
+Add the following configurations:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/tmp/cache"
+ssh_hostname = "localhost"
+enable_auto_lint = true
+run_as_devin = false
+sandbox_container_image = "public.ecr.aws/i5g0m1f6/ml-bench" # Use the latest image from the ML-Bench repository
+
+# TODO: Change these to the model you want to evaluate
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[eval_some_openai_compatible_model]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Run Inference on ML-Bench
+
+To run the evaluation on the ML-Bench dataset, use the following command:
+
+```bash
+./evaluation/ml_bench/scripts/run_infer.sh [model_config] [split] [agent] [eval_limit]
+# e.g., ./evaluation/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview full CodeActAgent 10
+```
+
+You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.
+
+## Examples
+
+For each task in the ML-Bench dataset, OpenDevin provides the agent with a set number of iterations to complete the task. The `history` field in the evaluation output shows each iteration's response and actions taken by the agent to complete the task.
+
+Here's an example of the evaluation output for a single task instance:
+
+```json
+{
+ "instance_id": 3,
+ "repo": "https://github.com/dmlc/dgl",
+ "instruction": "Please complete the Machine Learning task in the following repository: dgl\n\nThe task is: DGL Implementation of NGCF model\n\nI have a deep desire to embark on a journey brimming with knowledge and expertise. My objective is to train a cutting-edge NGCF Model, known for its unparalleled capabilities, on the illustrious dataset known as gowalla. To ensure swift execution, I kindly request your assistance in crafting the code, making use of the powerful GPU #3 and an embedding size of 32. Can you lend a helping hand to transform this dream into a reality?\n\nYou should create a script named `run.sh` under the specified path in the repo to run the task.\n\nYou can find the task repo at: /workspace/dgl/examples/pytorch/NGCF/NGCF\n\nYou should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).When you think you have completed the task, please run the following command: exit .\n",
+ "metadata": {
+ "agent_class": "CodeActAgent",
+ "model_name": "gpt-4-1106-preview",
+ "max_iterations": 10,
+ "eval_output_dir": "evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5",
+ "start_time": "2024-05-26 17:39:59",
+ "git_commit": "dd8ee9044a94a213dc2e31d2085dbf2924ee80a1"
+ },
+ "history": [
+ [
+ {
+ "id": 0,
+ "timestamp": "2024-05-26T17:40:41.060009",
+ "source": "user",
+ "message": "Please complete the Machine Learning task in the following repository: dgl\n\nThe task is: DGL Implementation of NGCF model\n\nI have a deep desire to embark on a journey brimming with knowledge and expertise. My objective is to train a cutting-edge NGCF Model, known for its unparalleled capabilities, on the illustrious dataset known as gowalla. To ensure swift execution, I kindly request your assistance in crafting the code, making use of the powerful GPU #3 and an embedding size of 32. Can you lend a helping hand to transform this dream into a reality?\n\nYou should create a script named `run.sh` under the specified path in the repo to run the task.\n\nYou can find the task repo at: /workspace/dgl/examples/pytorch/NGCF/NGCF\n\nYou should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).When you think you have completed the task, please run the following command: exit .\n",
+ "action": "message",
+ "args": {
+ "content": "Please complete the Machine Learning task in the following repository: dgl\n\nThe task is: DGL Implementation of NGCF model\n\nI have a deep desire to embark on a journey brimming with knowledge and expertise. My objective is to train a cutting-edge NGCF Model, known for its unparalleled capabilities, on the illustrious dataset known as gowalla. To ensure swift execution, I kindly request your assistance in crafting the code, making use of the powerful GPU #3 and an embedding size of 32. Can you lend a helping hand to transform this dream into a reality?\n\nYou should create a script named `run.sh` under the specified path in the repo to run the task.\n\nYou can find the task repo at: /workspace/dgl/examples/pytorch/NGCF/NGCF\n\nYou should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).When you think you have completed the task, please run the following command: exit .\n",
+ "wait_for_response": false
+ }
+ },
+ {
+ "message": "No observation",
+ "observation": "null",
+ "content": "",
+ "extras": {}
+ }
+ ],
+ // ... more iterations
+ ],
+ "eval_exit_code": 124, // ML-Bench believes the agent is successful if it continues to run until timeout
+ "eval_output": "",
+ "eval_script": "pip install Matplotlib==2.2.2\r\ncd /workspace/dgl/examples/pytorch/dgmg\r\npython main.py",
+ "metrics": {
+ "success": 1
+ }
+}
+```
+
+The `history` field contains the agent's actions and observations at each iteration, including the commands executed, file edits, and the agent's thoughts.
+
+The `eval_exit_code` and `eval_output` fields provide information about the execution of the evaluation command and its output.
+
+The `metrics` field contains the parsed evaluation metrics from the `eval_output`.
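+
+If you want to aggregate results across instances, the bundled `evaluation/ml_bench/scripts/summarise_results.py` script does this for you. As a rough illustration, here is a minimal sketch of reading `output.jsonl` directly (the path below is only an example; substitute your own `eval_output_dir`):
+
+```python
+import json
+
+# Example path only; point this at the output.jsonl inside your eval_output_dir.
+output_file = 'evaluation/evaluation_outputs/outputs/ml_bench/CodeActAgent/gpt-4-1106-preview_maxiter_10_N_v1.5/output.jsonl'
+
+results = []
+with open(output_file) as f:
+    for line in f:
+        results.append(json.loads(line))
+
+# `metrics.success` is 1 when ML-Bench considers the run successful (e.g., the script ran until timeout).
+num_success = sum(r['metrics']['success'] for r in results)
+print(f'{num_success}/{len(results)} instances succeeded')
+```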
+
+## Customization
+
+You can customize the evaluation script by modifying the `evaluation/ml_bench/run_infer.py` file. This script handles loading the ML-Bench dataset, running the agent on each task instance, and saving the evaluation outputs.
+
+Feel free to adjust the configuration, logging, and output formatting to suit your needs.
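+
+For example, the dataset-loading step in `run_infer.py` can be reproduced on its own when experimenting with preprocessing. A minimal sketch, assuming the `super-dainiu/ml-bench` dataset and the `full`/`quarter` splits used by the script:
+
+```python
+from datasets import load_dataset
+
+# run_infer.py supports the 'full' and 'quarter' splits.
+ml_bench = load_dataset('super-dainiu/ml-bench', split='quarter').to_pandas()
+
+# Each row is one task instance; inspect the available fields before changing the pipeline.
+print(ml_bench.columns.tolist())
+print(f'{len(ml_bench)} instances in this split')
+```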
+
+## Contributing
+
+If you encounter any issues or have suggestions for improvements, please open an issue or submit a pull request on the [GitHub repository](https://github.com/gersteinlab/ML-bench).
+
+## License
+
+This project is licensed under the [MIT License](LICENSE).
diff --git a/evaluation/ml_bench/__init__.py b/evaluation/ml_bench/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
new file mode 100644
index 0000000000..b861227969
--- /dev/null
+++ b/evaluation/ml_bench/run_infer.py
@@ -0,0 +1,387 @@
+"""
+Implements evaluation of agents on ML-Bench, a benchmark for assessing the effectiveness of
+Large Language Models (LLMs) in leveraging existing functions in open-source libraries for
+machine learning tasks. The benchmark is introduced in the paper "ML-Bench: Evaluating Large
+Language Models for Code Generation in Repository-Level Machine Learning Tasks"
+(https://arxiv.org/abs/2311.09835).
+
+Please see https://ghcr.io/super-dainiu/ml_bench and https://huggingface.co/datasets/super-dainiu/ml-bench
+for more details on the dataset and docker image used in this evaluation script.
+
+TODOs:
+- Support additional evaluation settings, such as providing raw README content or using a
+ retriever to extract relevant segments.
+- Clean up the code and docker image used for evaluation.
+"""
+
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+from opendevin.runtime.docker.ssh_box import DockerSSHBox
+
+
+def cleanup():
+ logger.info('Cleaning up child processes...')
+ for process in mp.active_children():
+ logger.info(f'Terminating child process: {process.name}')
+ process.terminate()
+ process.join()
+
+
+def codeact_user_response(state: State) -> str:
+ msg = (
+ 'Please continue working on the task on whatever approach you think is suitable.\n'
+ 'If you think you have completed the task, please run the following command: exit .\n'
+ 'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
+ )
+ if state.history:
+ user_msgs = [
+ action
+ for action, _ in state.history
+ if isinstance(action, MessageAction) and action.source == 'user'
+ ]
+ if len(user_msgs) >= 2:
+            # once the agent has already been nudged to continue, let it know that it can give up
+ return (
+ msg
+ + 'If you want to give up, run: exit .\n'
+ )
+ return msg
+
+
+def monologue_user_response(state: State) -> str:
+ raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
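+# Map each agent class to the function that produces a fake user response when the
+# agent asks for input during evaluation.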
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+ 'CodeActAgent': codeact_user_response,
+ 'MonologueAgent': monologue_user_response,
+}
+
+AGENT_CLS_TO_INST_SUFFIX = {
+ 'CodeActAgent': 'When you think you have completed the task, please run the following command: exit .\n'
+}
+
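+# Map the dataset's `github_id` field to the per-repository conda environment that is
+# activated inside the sandbox (these environments are provided by the ML-Bench docker image).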
+ID2CONDA = {
+ 1: 'dgl_DS',
+ 2: 'bert_DS',
+ 3: 'lavis_DS',
+ 4: 'if_DS',
+ 5: 'V2V_DS',
+ 6: 'esm_DS',
+ 7: 'OP_DS',
+ 8: 'TSL_DS',
+ 9: 'EAP_DS',
+ 10: 'PG_DS',
+ 11: 'PIM_DS',
+ 12: 'AD2_DS',
+ 13: 'L3_DS',
+ 14: 'MZ2_DS',
+ 15: 'GSA2_DS',
+}
+
+
+def process_instance(
+ instance, agent_class, metadata, eval_output_dir, reset_logger: bool = True
+):
+ old_workspace_mount_path = config.workspace_mount_path
+ old_workspace_base = config.workspace_base
+ try:
+ workspace_mount_path = os.path.join(
+ config.workspace_mount_path, '_eval_workspace'
+ )
+        # Create a process-specific workspace dir so that different agents don't interfere with each other.
+ workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+ pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+ # reset workspace to config
+ config.workspace_base = workspace_mount_path
+ config.workspace_mount_path = workspace_mount_path
+
+        # Set up the logger properly so that multiprocessing can be used to parallelize the evaluation
+ if reset_logger:
+ # Set up logger
+ log_file = os.path.join(
+ eval_output_dir,
+ 'logs',
+ f"instance_{instance['id']}_pid_{os.getpid()}.log",
+ )
+ # Remove all existing handlers from logger
+ for handler in logger.handlers[:]:
+ logger.removeHandler(handler)
+ # add back the console handler to print ONE line
+ logger.addHandler(get_console_handler())
+ logger.info(
+ f"Starting evaluation for instance {instance['id']}.\nLOG: tail -f {log_file}"
+ )
+ # Remove all existing handlers from logger
+ for handler in logger.handlers[:]:
+ logger.removeHandler(handler)
+ file_handler = logging.FileHandler(log_file)
+ file_handler.setFormatter(
+ logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+ )
+ logger.addHandler(file_handler)
+
+ logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+        # Create a sandbox, using the instance ID and process ID as the session ID to avoid conflicts
+ sandbox = DockerSSHBox(sid=str(instance['id']) + '_' + str(os.getpid()))
+
+ # Set up the task environment
+ sandbox.execute(f'conda activate {ID2CONDA[instance["github_id"]]}')
+
+ # Clone the task repo into the sandbox
+ repo_url = instance['github']
+ repo_name = repo_url.split('/')[-1]
+ sandbox.execute(f'git clone {repo_url} /workspace/{repo_name}')
+ sandbox.execute(f'chmod -R 777 /workspace/{repo_name}')
+
+ # Navigate to the task's code path
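+        # instance['path'] appears to start with './', so drop the leading './' before joining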
+ task_path = os.path.join('/workspace', repo_name, instance['path'][2:])
+ sandbox.execute(f'cd {task_path}')
+
+ # Prepare the task instruction
+ instruction = (
+ f'Please complete the Machine Learning task in the following repository: {repo_name}\n\n'
+ f'The task is: {instance["task"]}\n\n'
+ f'{instance["instruction"]}\n\n'
+ 'You should create a script named `run.sh` under the specified path in the repo to run the task.\n\n'
+ f'You can find the task repo at: {task_path}\n\n'
+ + (
+ 'Here is the prefix code for the task:\n'
+ '```bash\n'
+ f'{instance["prefix_code"]}\n'
+ '```\n\n'
+ if instance['prefix_code']
+ else ''
+ )
+            + 'You should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).\n'
+ )
+ instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+ # Run the agent
+ state: State = asyncio.run(
+ main(
+ instruction,
+ fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+ agent_class
+ ),
+ sandbox=sandbox,
+ )
+ )
+ metrics = state.metrics.get() if state.metrics else {}
+
+ # Evaluate the agent's script
+ eval_script = os.path.join(task_path, 'run.sh')
+ logger.info(f'Running evaluation script: {eval_script}')
+
+ try:
+ _, eval_script_content = sandbox.execute(f'cat {eval_script}')
+ except Exception as e:
+ logger.error(f'Error reading evaluation script: {e}')
+ eval_script_content = ''
+
+ try:
+ exit_code, eval_output = sandbox.execute(
+ f'timeout 120s conda run -n {ID2CONDA[instance["github_id"]]} bash {eval_script}',
+ timeout=600,
+ )
+ except Exception as e:
+ logger.error(f'Error running evaluation script: {e}')
+ exit_code = -1
+ eval_output = ''
+
+ if exit_code != 0 and exit_code != 124:
+ logger.warning(f'Evaluation script failed with exit code {exit_code}')
+ logger.warning(f'Output: {eval_output}')
+ metrics['success'] = int(
+ 'KeyboardInterrupt' in eval_output
+ ) # super-dainiu: assume ``KeyboardInterrupt`` is a success as is done in ML-Bench
+ else:
+ logger.info(f'Evaluation script succeeded with exit code {exit_code}')
+ logger.info(f'Output: {eval_output}')
+ metrics['success'] = 1
+
+ # Save the output
+ output = {
+ 'instance_id': instance['id'],
+ 'repo': repo_url,
+ 'instruction': instruction,
+ 'metadata': metadata,
+ 'history': [
+ (event_to_dict(action), event_to_dict(obs))
+ for action, obs in state.history
+ ],
+ 'eval_script': eval_script_content,
+ 'eval_exit_code': exit_code,
+ 'eval_output': eval_output,
+ 'metrics': metrics,
+ }
+
+ except Exception as e:
+ logger.error(f'Error processing instance {instance["id"]}: {e}')
+ raise
+ finally:
+ config.workspace_mount_path = old_workspace_mount_path
+ config.workspace_base = old_workspace_base
+
+ # Shutdown the sandbox
+ sandbox.close()
+ return output
+
+
+if __name__ == '__main__':
+ parser = get_parser()
+ parser.add_argument(
+ '-s',
+ '--eval-split',
+ type=str,
+ default='quarter',
+ choices=['full', 'quarter'],
+ help='data split to evaluate on, either full or quarter',
+ )
+ args, _ = parser.parse_known_args()
+
+ data_split = args.eval_split
+ agent_class = args.agent_cls
+ num_workers = args.eval_num_workers
+
+ # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+ # for details of how to set `llm_config`
+ if args.llm_config:
+ specified_llm_config = get_llm_config_arg(args.llm_config)
+ if specified_llm_config:
+ config.llm = specified_llm_config
+ logger.info(f'Config for evaluation: {config}')
+
+ # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+ # so we don't need to manage file uploading to OpenDevin's repo
+ ml_bench = load_dataset('super-dainiu/ml-bench', split=data_split).to_pandas()
+
+ # LIMIT EVALUATION
+ eval_n_limit = args.eval_n_limit
+ if eval_n_limit:
+ ml_bench = ml_bench.head(eval_n_limit)
+ logger.info(f'Limiting evaluation to {eval_n_limit} instances.')
+
+ # TEST METADATA
+ model_name = config.llm.model.split('/')[-1]
+ max_iterations = args.max_iterations
+ eval_note = ''
+ if args.eval_note is not None:
+ eval_note += '_N_' + args.eval_note
+ eval_output_dir = os.path.join(
+ args.eval_output_dir,
+ 'ml_bench',
+ agent_class,
+ model_name + '_maxiter_' + str(max_iterations) + eval_note,
+ )
+ os.makedirs(eval_output_dir, exist_ok=True)
+ os.makedirs(os.path.join(eval_output_dir, 'logs'), exist_ok=True)
+ logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+ metadata = {
+ 'agent_class': agent_class,
+ 'model_name': model_name,
+ 'max_iterations': max_iterations,
+ 'eval_output_dir': eval_output_dir,
+ 'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of current repo for reproducibility
+ 'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+ .decode('utf-8')
+ .strip(),
+ }
+ logger.info(f'Metadata: {metadata}')
+
+ output_file = os.path.join(eval_output_dir, 'output.jsonl')
+ logger.info(f'Evaluating on data split: {data_split}')
+ logger.info(f'Using {num_workers} worker processes')
+ logger.info(f'Writing evaluation output to {output_file}')
+
+ finished_instance_ids = set()
+ if os.path.exists(output_file):
+ with open(output_file, 'r') as f:
+ for line in f:
+                try:
+                    data = json.loads(line)
+                except json.JSONDecodeError:
+                    print(f'Error parsing line: {line}')
+                    continue
+                finished_instance_ids.add(data['instance_id'])
+ logger.warning(
+ f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+ )
+ output_fp = open(output_file, 'a')
+
+ logger.info(
+ f'Evaluation started with Agent {agent_class}, model {model_name}, data split {data_split}.'
+ )
+
+ # Filter out finished instances
+ new_instances = [
+ instance
+ for _, instance in ml_bench.iterrows()
+ if instance['id'] not in finished_instance_ids
+ ]
+ logger.info(
+ f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(new_instances)}'
+ )
+
+ pbar = tqdm(total=len(new_instances))
+
+ # This function tracks the progress AND writes the output to a JSONL file
+ def update_progress(future):
+ pbar.update(1)
+ output = future.result()
+ pbar.set_description(f'Instance {output["instance_id"]}')
+ pbar.set_postfix_str(f'Metrics: {output["metrics"]}')
+ logger.info(
+ f'Finished evaluation for instance {output["instance_id"]}: {output["metrics"]}'
+ )
+ output_fp.write(json.dumps(output) + '\n')
+ output_fp.flush()
+
+    # Run the evaluation with a pool of worker processes
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+ try:
+ with ProcessPoolExecutor(num_workers) as executor:
+ futures = []
+            for instance in new_instances:
+ future = executor.submit(
+ process_instance,
+ instance,
+ agent_class,
+ metadata,
+ eval_output_dir,
+ reset_logger=bool(num_workers > 1),
+ )
+ future.add_done_callback(update_progress)
+ futures.append(future)
+
+            # Wait for all remaining futures to complete
+            for future in futures:
+                future.result()
+ except KeyboardInterrupt:
+ print('KeyboardInterrupt received. Cleaning up...')
+ cleanup()
+
+ logger.info('Evaluation completed.')
diff --git a/evaluation/ml_bench/scripts/cleanup.sh b/evaluation/ml_bench/scripts/cleanup.sh
new file mode 100644
index 0000000000..c6c90f662c
--- /dev/null
+++ b/evaluation/ml_bench/scripts/cleanup.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# Step 1: Stop all running containers
+echo "Stopping all running containers..."
+docker stop $(docker ps -q)
+
+# Step 2: Remove all containers (running and stopped)
+echo "Removing all containers..."
+docker rm $(docker ps -a -q)
+
+# Optional: Remove all Docker images (if you want to clean up images too)
+# echo "Removing all Docker images..."
+# docker rmi $(docker images -q)
+
+echo "All containers have been removed."
diff --git a/evaluation/ml_bench/scripts/run_infer.sh b/evaluation/ml_bench/scripts/run_infer.sh
new file mode 100755
index 0000000000..557acc8e18
--- /dev/null
+++ b/evaluation/ml_bench/scripts/run_infer.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
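+# Usage: ./evaluation/ml_bench/scripts/run_infer.sh [model_config] [split] [agent] [eval_limit]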
+MODEL_CONFIG=$1
+SPLIT=$2
+AGENT=$3
+EVAL_LIMIT=$4
+
+if [ -z "$MODEL_CONFIG" ]; then
+ echo "Model config not specified, use default"
+ MODEL_CONFIG="eval_gpt4_1106_preview"
+fi
+
+if [ -z "$AGENT" ]; then
+ echo "Agent not specified, use default CodeActAgent"
+ AGENT="CodeActAgent"
+fi
+
+# IMPORTANT: Because the agent's prompt changes fairly often in OpenDevin's rapidly evolving codebase,
+# we need to track the agent version in the evaluation to make sure results are comparable.
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+COMMAND="poetry run python evaluation/ml_bench/run_infer.py \
+ --agent-cls $AGENT \
+ --llm-config $MODEL_CONFIG \
+ --max-iterations 10 \
+ --eval-num-workers 4 \
+ --eval-note $AGENT_VERSION"
+
+if [ -n "$EVAL_LIMIT" ]; then
+ echo "EVAL_LIMIT: $EVAL_LIMIT"
+ COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+if [ -n "$SPLIT" ]; then
+ echo "SPLIT: $SPLIT"
+ COMMAND="$COMMAND --eval-split $SPLIT"
+fi
+
+# Run the command
+eval $COMMAND
diff --git a/evaluation/ml_bench/scripts/summarise_results.py b/evaluation/ml_bench/scripts/summarise_results.py
new file mode 100644
index 0000000000..5d7f4f495a
--- /dev/null
+++ b/evaluation/ml_bench/scripts/summarise_results.py
@@ -0,0 +1,57 @@
+import json
+import pprint
+import sys
+
+
+def extract_test_results(res_file_path: str) -> tuple[list[dict], list[dict], list[float]]:
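+    """Split the records in an output.jsonl file into passed and failed instances,
+    and collect each instance's accumulated cost."""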
+ passed = []
+ failed = []
+ costs = []
+ with open(res_file_path, 'r') as file:
+ for line in file:
+ data = json.loads(line.strip())
+ success = data['metrics']['success']
+ if success:
+ passed.append(
+ {
+ 'instance_id': data['instance_id'],
+ 'repo': data['repo'],
+ 'instruction': data['instruction'],
+ 'eval_script': data['eval_script'],
+ 'eval_exit_code': data['eval_exit_code'],
+ 'eval_output': data['eval_output'],
+ 'accumulated_cost': data['metrics']['accumulated_cost'],
+ }
+ )
+ else:
+ failed.append(
+ {
+ 'instance_id': data['instance_id'],
+ 'repo': data['repo'],
+ 'instruction': data['instruction'],
+ 'eval_script': data['eval_script'],
+ 'eval_exit_code': data['eval_exit_code'],
+ 'eval_output': data['eval_output'],
+ 'accumulated_cost': data['metrics']['accumulated_cost'],
+ }
+ )
+ costs.append(data['metrics']['accumulated_cost'])
+ return passed, failed, costs
+
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+        print(
+            'Usage: poetry run python summarise_results.py <path_to_output_jsonl>'
+        )
+ sys.exit(1)
+ json_file_path = sys.argv[1]
+ passed_tests, failed_tests, costs = extract_test_results(json_file_path)
+ success_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
+ print('PASSED TESTS:')
+ pprint.pprint(passed_tests)
+ print('FAILED TESTS:')
+ pprint.pprint(failed_tests)
+ print(
+ f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, success rate = {success_rate}, average cost = {sum(costs) / len(costs)}'
+ )