diff --git a/evaluation/README.md b/evaluation/README.md index 047d09c4e8..8cd0704e66 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -16,6 +16,7 @@ all the preprocessing/evaluation/analysis scripts. - HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix) - GAIA: [`evaluation/gaia`](./gaia) - Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA) +- MINT: [`evaluation/mint`](./mint) ### Result Visualization diff --git a/evaluation/mint/.gitignore b/evaluation/mint/.gitignore new file mode 100644 index 0000000000..4fa1a31067 --- /dev/null +++ b/evaluation/mint/.gitignore @@ -0,0 +1 @@ +!requirements.txt diff --git a/evaluation/mint/README.md b/evaluation/mint/README.md new file mode 100644 index 0000000000..f436166239 --- /dev/null +++ b/evaluation/mint/README.md @@ -0,0 +1,45 @@ +# MINT Benchmark + +This folder contains the evaluation harness for the [MINT benchmark](https://arxiv.org/abs/2309.10691) on LLMs' ability to solve tasks with multi-turn interactions. + +## Configure OpenDevin and LM + +Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up. + +## Start the evaluation + +We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/datasets/ryanhoangt/xingyaoww-mint-bench). + +Following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`. + +```bash +./evaluation/mint/scripts/run_infer.sh [model_config] [subset] [eval_limit] +``` + +where `model_config` is mandatory, while `subset` and `eval_limit` are optional. + +- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`. + +- `subset`, e.g. `math`, is the subset of the MINT benchmark to evaluate on, defaulting to `math`. + +- `eval_limit`, e.g. `2`, limits the evaluation to the first `eval_limit` instances, defaulting to all instances. + +Note: in order to use `eval_limit`, you must also set `subset`. 
+
+Let's say you'd like to run 3 instances on the `gsm8k` subset using `eval_gpt4_1106_preview`,
+then your command would be:
+
+```bash
+./evaluation/mint/scripts/run_infer.sh eval_gpt4_1106_preview gsm8k 3
+```
+## Reference
+```
+@misc{wang2024mint,
+  title={MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language Feedback},
+  author={Xingyao Wang and Zihan Wang and Jiateng Liu and Yangyi Chen and Lifan Yuan and Hao Peng and Heng Ji},
+  year={2024},
+  eprint={2309.10691},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+```
diff --git a/evaluation/mint/config_variables.py b/evaluation/mint/config_variables.py
new file mode 100644
index 0000000000..6e38dc9e29
--- /dev/null
+++ b/evaluation/mint/config_variables.py
@@ -0,0 +1,5 @@
+TASK_INFO_MAP = {
+    # === Reasoning ===
+    'gsm8k': {'class': 'ReasoningTask', 'type': 'reasoning'},
+    'math': {'class': 'ReasoningTask', 'type': 'reasoning'},
+}
diff --git a/evaluation/mint/datatypes.py b/evaluation/mint/datatypes.py
new file mode 100644
index 0000000000..7f22476c49
--- /dev/null
+++ b/evaluation/mint/datatypes.py
@@ -0,0 +1,82 @@
+import enum
+from typing import Any, Dict, Tuple
+
+
+class TaskState:
+    def __init__(
+        self,
+        finished: bool = False,
+        success: bool = False,
+        agent_action_count: dict = None,
+        terminate_reason: str = None,
+        latest_output: Dict[str, Any] = None,
+    ):
+        self.finished = finished
+        self.success = success
+        self.agent_action_count: Dict[str, int] = agent_action_count or {
+            'propose_solution': 0,
+            'use_tool': 0,
+            'invalid_action': 0,
+        }
+        self.terminate_reason = terminate_reason
+        self.latest_output = latest_output
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'finished': self.finished,
+            'success': self.success,
+            'agent_action_count': self.agent_action_count,
+            'terminate_reason': self.terminate_reason,
+            'latest_output': self.latest_output,
+        }
+
+
+class ParseError(Exception):
+    pass
+
+
+class FeedbackType(enum.Enum):
+    FEEDBACK_WITH_GT = 'feedback_with_gt'
+    FEEDBACK_WO_GT = 'feedback_wo_gt'
+    NO_FEEDBACK = 'no_feedback'
+
+
+class StepOutput:
+    def __init__(
+        self,
+        observation: str = None,
+        success: bool = False,
+        extra: Dict[str, Any] = None,
+        turn_info: Tuple[int, int] = None,
+    ):
+        self.observation: str = observation
+        self.success: bool = success
+        self.extra: Dict[str, Any] = extra
+        self.turn_info = turn_info
+
+    def __repr__(self) -> str:
+        return self.observation
+
+    def to_str(self) -> str:
+        output = 'Observation:\n'
+        if self.observation is not None:
+            output += self.observation + '\n'
+        else:
+            if not self.success:
+                output += 'Your answer is wrong.\n'
+
+        if self.turn_info is not None:
+            n_steps_left, n_propose_solution_left = self.turn_info
+            output += 'You have {} steps left and {} chances left to propose a solution.\n'.format(
+                n_steps_left, n_propose_solution_left
+            )
+            if n_steps_left <= 1:
+                output += 'You should take the last step to propose a solution.\n'
+
+        return output
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'observation': self.observation,
+            'success': self.success,
+        }
diff --git a/evaluation/mint/env.py b/evaluation/mint/env.py
new file mode 100644
index 0000000000..76a8615f10
--- /dev/null
+++ b/evaluation/mint/env.py
@@ -0,0 +1,119 @@
+import re
+import traceback
+from typing import Dict, Optional
+
+from datatypes import ParseError, StepOutput, TaskState
+from task import Task
+
+from opendevin.controller.state.state import State
+
+
+class SimplifiedEnv:
+    INVALID_INPUT_MESSAGE = (
+        "I don't understand your input.\n"
+        'If you want to execute code, please use <execute_ipython> YOUR_CODE_HERE </execute_ipython>.\n'
+        'If you want to give me an answer, please use <solution> YOUR_SOLUTION_HERE </solution>.\n'
+        'For example: The answer to the question is <solution> 42 </solution>.\n'
+    )
+
+    def __init__(self, agent_state: State, task: Task, task_config: Dict[str, int]):
+        self.agent_state = agent_state
+        self.task = task
+        self.task_state = TaskState()
+        self.task_config = task_config
+
+    def step(self, lm_message: str):
+        observation = self.handle_propose_solution(lm_message)
+
+        self.check_max_iteration()
+
+        turn_info = (
+            self.task_config['max_iterations'] - self.agent_state.iteration,
+            self.task_config['max_propose_solution']
+            - self.task_state.agent_action_count['propose_solution'],
+        )
+
+        output = StepOutput(
+            observation=observation,
+            success=self.task_state.success,
+            turn_info=turn_info,
+        )
+
+        self.log_output(output)
+        return self.task_state
+
+    def handle_propose_solution(self, lm_message) -> Optional[str]:
+        """Parse the proposed answer and check whether it solves the task.
+
+        It might set self.task_state.finished = True if the task is solved successfully.
+        """
+        self.task_state.agent_action_count['propose_solution'] += 1
+        try:
+            parsed = self.parse_propose_solution(lm_message)
+            task_success = self.check_task_success(parsed['answer'])
+            if task_success:
+                self.task_state.finished = True
+                self.task_state.success = True
+                self.task_state.terminate_reason = 'task_success'
+                # NOTE: do not return from the function here, because we still need to log the output.
+                # Setting task_state.finished = True will terminate the episode.
+        except ParseError:
+            return SimplifiedEnv.INVALID_INPUT_MESSAGE
+        except Exception:
+            error_traceback = traceback.format_exc()
+            return f'{error_traceback}'
+
+    def parse_propose_solution(self, lm_message: str) -> dict:
+        """Define the parsing logic."""
+        lm_output = '\n' + lm_message + '\n'
+
+        answer = '\n'.join(
+            [
+                i.strip()
+                for i in re.findall(r'<solution>(.*?)</solution>', lm_output, re.DOTALL)
+            ]
+        )
+        if answer == '':
+            raise ParseError('No answer found.')
+
+        return {'answer': answer}
+
+    def log_output(self, output: StepOutput) -> None:
+        if self.task_state.finished:
+            return
+
+        content = output.to_str()
+        # self.state.history.append({"role": "user", "content": content})
+        self.task_state.latest_output = output.to_dict()
+        self.task_state.latest_output['content'] = content
+
+    def check_task_success(self, answer: str) -> bool:
+        # log_message.info(f"STUDENT ANSWER: [{answer}]")
+        # log_message.info(f"REFERENCE ANSWER: [{self.task.reference}]")
+        return self.task.success(answer)
+
+    def check_max_iteration(self):
+        """Check if the agent has reached the max iteration limit.
+
+        It might set self.task_state.finished = True if the agent has reached the max iteration limit.
+        """
+        if self.task_state.finished:
+            # ignore if the episode is already finished (e.g., task success)
+            return
+
+        if (
+            # the number of proposed solutions exceeds max_propose_solution
+            self.task_state.agent_action_count['propose_solution']
+            >= self.task_config['max_propose_solution']
+        ):
+            self.task_state.finished = True
+            self.task_state.success = False
+            self.task_state.terminate_reason = 'max_propose_steps'
+        elif (
+            # the total action count (propose_solution + use_tool) reaches the max iteration limit
+            sum(self.task_state.agent_action_count.values())
+            >= self.task_config['max_iterations']
+        ):
+            self.task_state.finished = True
+            self.task_state.success = False
+            self.task_state.terminate_reason = 'max_iterations'
diff --git a/evaluation/mint/in_context_examples/reasoning/with_tool.txt b/evaluation/mint/in_context_examples/reasoning/with_tool.txt
new file mode 100644
index 0000000000..448e173936
--- /dev/null
+++ b/evaluation/mint/in_context_examples/reasoning/with_tool.txt
@@ -0,0 +1,48 @@
+Task:
+Evaluate $\int\int_C (x+y)^2 dx\,dy$, where C is the circle defined by $x^2 + y^2 = M$, and M is the birth year of Li Bai.
+Output format requirement: Round to three decimal places, using half up rounding rule.
+
+Assistant:
+<thought> To solve this problem, let's start by determining Li Bai's birth year. </thought>
+<execute_ipython> print(wikipedia_search('Li Bai')) </execute_ipython>
+
+Observation:
+Page: Li Bai
+Summary: Li Bai (Chinese: 李白; pinyin: Lǐ Bái, 701–762), also pronounced as Li Bo, courtesy name Taibai (Chinese: 太白), was a Chinese poet, acclaimed from his own time to the present as a brilliant and romantic figure who took traditional poetic forms to new heights.
+
+Assistant:
+<thought>
+From the wiki_search result, we can see that Li Bai was born in 701, thus M = 701.
+To evaluate the double integral of $(x+y)^2$ over the region $C$, where $C$ is the circle defined by $x^2 + y^2 = 701$, we should use polar coordinates.
+In polar coordinates, $x = r\cos(\theta)$ and $y = r\sin(\theta)$, and the differential area element $dx\,dy$ becomes $r\,dr\,d\theta$.
+$\int\int_C (x+y)^2 dx\,dy = \int_0^{2\pi}\int_0^{\sqrt{701}} (r\cos(\theta) + r\sin(\theta))^2 r\,dr\,d\theta$
+Let's use the SymPy package to calculate this.
+</thought>
+<execute_ipython>
+from sympy import symbols, integrate, cos, sin, pi
+
+r, theta = symbols('r theta')
+
+# the integrand
+integrand = (r*cos(theta) + r*sin(theta))**2 * r
+
+# perform the double integration
+double_integral = integrate(integrate(integrand, (r, 0, 701 ** 0.5)), (theta, 0, 2*pi))
+print(double_integral)
+</execute_ipython>
+
+Observation:
+Out[2]: 245700.5*pi
+
+Assistant:
+<thought> Let's round this result to three decimal places using half up rounding rule. </thought>
+<execute_ipython>
+import math
+print(round(245700.5*math.pi, 3))
+</execute_ipython>
+
+Observation:
+771890.886
+
+Assistant:
+The answer is <solution> 771890.886 </solution>.
diff --git a/evaluation/mint/prompts/__init__.py b/evaluation/mint/prompts/__init__.py
new file mode 100644
index 0000000000..e07c54e748
--- /dev/null
+++ b/evaluation/mint/prompts/__init__.py
@@ -0,0 +1,25 @@
+import os
+
+from utils import load_file
+
+PROMPT_DIR = os.path.dirname(__file__)
+TEMPLATE_WITH_TOOL = load_file(os.path.join(PROMPT_DIR, 'template_with_tool.txt'))
+
+
+class PromptTemplate:
+    """A prompt template."""
+
+    def __init__(self, template: str):
+        self.template: str = template
+
+    def __call__(self, **kwargs) -> str:
+        return self.template.format(**kwargs)
+
+
+class ToolPromptTemplate(PromptTemplate):
+    def __init__(self, use_tool: bool):
+        if use_tool:
+            template = TEMPLATE_WITH_TOOL
+        else:
+            raise NotImplementedError('Evaluation without tool is not supported yet.')
+        super().__init__(template)
diff --git a/evaluation/mint/prompts/template_with_tool.txt b/evaluation/mint/prompts/template_with_tool.txt
new file mode 100644
index 0000000000..b45dd2b32c
--- /dev/null
+++ b/evaluation/mint/prompts/template_with_tool.txt
@@ -0,0 +1,19 @@
+You are a helpful assistant assigned with the task of problem-solving.
+To solve the task, you can only interact with the interactive Python (Jupyter Notebook) environment using the <execute_ipython> tag. Other tools cannot be used.
+At each turn, you should first provide your step-by-step thinking for solving the task. Your thought process should be enclosed using the "<thought>" tag, for example: <thought> I need to print "Hello World!" </thought>.
+
+After that, you have two options:
+1) Interact with a Python programming environment and receive the corresponding output.
+2) Directly provide a solution by sending your answer to the user through a message that adheres to the required format for the given task. Your solution should be enclosed using the "<solution>" tag, for example: The answer is <solution> A </solution>.
+Whether you choose to interact with the Python environment or to provide a solution, you need to send a message to the user so that they can evaluate your response and provide feedback.
+
+You have {max_total_steps} chances to interact with the environment or propose a solution. You can only propose a solution {max_propose_solution} times.
+ +--- + +{in_context_example} + +--- + +# Problem statement: +{task_prompt} diff --git a/evaluation/mint/requirements.txt b/evaluation/mint/requirements.txt new file mode 100644 index 0000000000..97263cd2b8 --- /dev/null +++ b/evaluation/mint/requirements.txt @@ -0,0 +1,32 @@ +pre-commit +openai +datasets +backoff +charset-normalizer==3.1.0 +# Alfworld +pandas==1.4.4 +opencv-python +networkx +tqdm +vocab +revtok +Click +ai2thor==2.1.0 +transformers +tokenizers +scipy==1.10.1 +ipython +matplotlib +cython +nltk +gym==0.15.4 +pipreqs +pyyaml +pytz +visdom +sympy +pycocotools +seaborn +google-generativeai +python-dateutil +statsmodels diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py new file mode 100644 index 0000000000..6419317a5c --- /dev/null +++ b/evaluation/mint/run_infer.py @@ -0,0 +1,357 @@ +import asyncio +import functools +import json +import logging +import multiprocessing as mp +import os +import pathlib +import subprocess +import time +from concurrent.futures import ProcessPoolExecutor +from typing import Dict + +from datasets import load_dataset +from datatypes import TaskState +from env import SimplifiedEnv +from prompts import ToolPromptTemplate +from task import ReasoningTask, Task +from tqdm import tqdm + +from evaluation.swe_bench.swe_env_box import DockerSSHBox +from opendevin.controller.state.state import State +from opendevin.core.config import config, get_llm_config_arg, get_parser +from opendevin.core.logger import get_console_handler +from opendevin.core.logger import opendevin_logger as logger +from opendevin.core.main import main +from opendevin.events.serialization.event import event_to_dict + + +def cleanup(): + print('Cleaning up child processes...') + for process in mp.active_children(): + print(f'Terminating child process: {process.name}') + process.terminate() + process.join() + + +def codeact_user_response(state: State, task: Task, task_config: Dict[str, int]): + logger.info(f'Gold reference: {task.reference}') + logger.info(f'Task config: {task_config}') + + env = SimplifiedEnv( + agent_state=state, + task=task, + task_config=task_config, + ) + last_action, _ = state.history[-1] + result_state: TaskState = env.step(last_action.message) + state.task_state = result_state + + if not result_state.latest_output: + if result_state.success: + msg = 'Your answer is correct. Please EXIT using the following command: exit .' + else: + msg = 'Something went wrong! No output from the model.' + else: + msg = result_state.latest_output['content'] + + logger.info('User response:' + msg) + return msg + + +def monologue_user_response(state: State) -> str: + raise NotImplementedError('MonologueAgent should never ask for user responses.') + + +AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { + 'CodeActAgent': codeact_user_response, + 'MonologueAgent': monologue_user_response, +} + +AGENT_CLS_TO_INST_SUFFIX = { + 'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: exit .\n' +} + + +def process_instance( + instance: Task, + agent_class, + metadata, + skip_workspace_mount, + eval_output_dir, + reset_logger: bool = True, +): + workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace') + # create process-specific workspace dir + # if `not skip_workspace_mount` - we will create a workspace directory for EACH process + # so that different agent don't interfere with each other. 
+ if not skip_workspace_mount: + workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid())) + pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True) + + # Setup the logger properly, so you can run multi-processing to parallize the evaluation + if reset_logger: + # Set up logger + log_file = os.path.join( + eval_output_dir, 'logs', f'instance_{instance.task_id}.log' + ) + # Remove all existing handlers from logger + for handler in logger.handlers[:]: + logger.removeHandler(handler) + # add back the console handler to print ONE line + logger.addHandler(get_console_handler()) + logger.info( + f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell' + ) + # Remove all existing handlers from logger + for handler in logger.handlers[:]: + logger.removeHandler(handler) + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter( + logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + ) + logger.addHandler(file_handler) + + if not skip_workspace_mount: + logger.info(f'Process-specific workspace mounted at {workspace_mount_path}') + + sandbox = DockerSSHBox() + + requirements_host_src = 'evaluation/mint/requirements.txt' + requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt' + sandbox.copy_to( + host_src=requirements_host_src, + sandbox_dest=requirements_sandbox_dest, + recursive=False, + ) + logger.info( + f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.' + ) + exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}') + + # Prepare instruction + instruction = ToolPromptTemplate(use_tool=True)( + max_total_steps=metadata['max_iterations'], + max_propose_solution=metadata['max_propose_solution'], + in_context_example=instance.in_context_example( + use_tool=True, with_feedback=False + ), + task_prompt='Task:\n' + instance.prompt, + ) + instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the solution inside tag AND NEVER ASK FOR HUMAN HELP.\n' + + # NOTE: You can actually set slightly different instruction for different agents + instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '') + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + fake_user_response_fn = functools.partial( + AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class), + task=instance, + task_config={ + 'max_iterations': metadata['max_iterations'], + 'max_propose_solution': metadata['max_propose_solution'], + }, + ) + + state: State = asyncio.run( + main( + instruction, + fake_user_response_fn=fake_user_response_fn, + sandbox=sandbox, + ) + ) + + if state is None: + raise ValueError('State should not be None.') + + logger.info('Msgs: ' + str(state.history)) + + task_state: TaskState = state.task_state + logger.info('Task state: ' + str(task_state.to_dict())) + + # Save the output + output = { + 'id': instance.task_id, + 'instance': instance.to_dict(), + 'instruction': instruction, + 'metadata': metadata, + 'history': [ + (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history + ], + 'error': state.error if state and state.error else None, + 'test_result': task_state.success, + } + + # Close the sandbox + sandbox.close() + + return output + + +if __name__ == '__main__': + parser = get_parser() + + parser.add_argument( + '--subset', + default='math', + choices=['math', 'gsm8k'], + type=str, + help='subset of 
the dataset to be used', + ) + parser.add_argument( + '--max-propose-solution', + default=2, + type=int, + help='maximum number of times the agent can propose a solution', + ) + + args, _ = parser.parse_known_args() + + # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing + # so we don't need to manage file uploading to OpenDevin's repo + mint_dataset = load_dataset( + 'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test' + ) + logger.info(f'Evaluating MINT - {args.subset} subset') + + # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm + # for details of how to set `llm_config` + if args.llm_config: + specified_llm_config = get_llm_config_arg(args.llm_config) + if specified_llm_config: + config.llm = specified_llm_config + logger.info(f'Config for evaluation: {config}') + + # TEST METADATA + agent_class = args.agent_cls + assert ( + agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN + ), f'Unsupported agent class: {agent_class}' + model_name = config.llm.model.split('/')[-1] + max_iterations = args.max_iterations + eval_note = '' + if args.eval_note is not None: + eval_note += '_N_' + args.eval_note + eval_output_dir = os.path.join( + args.eval_output_dir, + 'mint', + agent_class, + model_name + '_maxiter_' + str(max_iterations) + eval_note, + args.subset, + ) + + pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True) + pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir( + parents=True, exist_ok=True + ) + logger.info(f'Using evaluation output directory: {eval_output_dir}') + + metadata = { + 'agent_class': agent_class, + 'model_name': model_name, + 'max_iterations': max_iterations, + 'max_propose_solution': args.max_propose_solution, + 'eval_output_dir': eval_output_dir, + 'start_time': time.strftime('%Y-%m-%d %H:%M:%S'), + # get the commit id of current repo for reproduciblity + 'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD']) + .decode('utf-8') + .strip(), + } + logger.info(f'Metadata: {metadata}') + with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f: + json.dump(metadata, f) + + # LIMIT EVALUATION + eval_n_limit = args.eval_n_limit + if eval_n_limit: + mint_dataset = mint_dataset.select(range(eval_n_limit)) + logger.info(f'Limiting evaluation to first {eval_n_limit} instances.') + + # OUTPUT FILE + output_file = os.path.join(eval_output_dir, 'output.jsonl') + logger.info(f'Writing evaluation output to {output_file}') + finished_instance_ids = set() + if os.path.exists(output_file): + with open(output_file, 'r') as f: + for line in f: + data = json.loads(line) + finished_instance_ids.add(data['id']) + logger.warning( + f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.' + ) + output_fp = open(output_file, 'a') + + logger.info( + f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}, max propose solution {args.max_propose_solution}.' + ) + + # ============================================= + # filter out finished instances + task_class = ReasoningTask + new_mint_tests: list[ReasoningTask] = [] + for instance in mint_dataset: + if instance['id'] in finished_instance_ids: + logger.info( + f'Skipping instance {instance["id"]} as it is already finished.' 
+ ) + continue + # convert to Task object + instance = ReasoningTask(**instance) + new_mint_tests.append(instance) + + mint_dataset = new_mint_tests + logger.info( + f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(mint_dataset)}' + ) + # ============================================= + + pbar = tqdm(total=len(mint_dataset)) + + # This function tracks the progress AND write the output to a JSONL file + def update_progress(future): + pbar.update(1) + output = future.result() + # logger.info('Output: ', output) + # pbar.set_description(f'Instance {output["instance_id"]}') + # pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}') + # logger.info( + # f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}' + # ) + output_fp.write(json.dumps(output) + '\n') + output_fp.flush() + + # This sets the multi-processing + num_workers = args.eval_num_workers + logger.info(f'Using {num_workers} workers for evaluation.') + + # This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work + skip_workspace_mount = agent_class == 'CodeActAgent' + logger.info(f'Skipping workspace mount: {skip_workspace_mount}') + + try: + with ProcessPoolExecutor(num_workers) as executor: + futures = [] + # This is how we perform multi-processing + for instance in mint_dataset: + future = executor.submit( + process_instance, + instance, + agent_class, + metadata, + skip_workspace_mount, + eval_output_dir, + reset_logger=bool(num_workers > 1), + ) + future.add_done_callback(update_progress) + futures.append(future) + + # Wait for all futures to complete + for future in futures: + future.result() + except KeyboardInterrupt: + print('KeyboardInterrupt received. Cleaning up...') + cleanup() + + output_fp.close() + logger.info('Evaluation finished.') diff --git a/evaluation/mint/scripts/run_infer.sh b/evaluation/mint/scripts/run_infer.sh new file mode 100644 index 0000000000..77d6a5797f --- /dev/null +++ b/evaluation/mint/scripts/run_infer.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +MODEL_CONFIG=$1 +SUBSET=$2 +EVAL_LIMIT=$3 +# Only 'CodeActAgent' is supported for MINT now +AGENT="CodeActAgent" + +# We need to track the version of Agent in the evaluation to make sure results are comparable +AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)") + +echo "AGENT: $AGENT" +echo "AGENT_VERSION: $AGENT_VERSION" + +export PYTHONPATH=$(pwd) + +COMMAND="poetry run python ./evaluation/mint/run_infer.py \ + --max-iterations 5 \ + --max-propose-solution 2 \ + --eval-note $AGENT_VERSION" + +if [ -n "$SUBSET" ]; then + echo "SUBSET: $SUBSET" + COMMAND="$COMMAND --subset $SUBSET" +# otherwise default to use the math subset +else + echo "SUBSET: math" + COMMAND="$COMMAND --subset math" +fi + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/evaluation/mint/task.py b/evaluation/mint/task.py new file mode 100644 index 0000000000..dc578cf125 --- /dev/null +++ b/evaluation/mint/task.py @@ -0,0 +1,121 @@ +import json +import logging +import os +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple + +from utils import load_file + +LOGGER = logging.getLogger('MINT') + + +class Task(ABC): + """Base class for a task instance.""" + + task_name: str = 'base' + in_context_example_dir = os.path.join( + 
os.path.dirname(os.path.abspath(__file__)), + 'in_context_examples', + ) + + def __init__(self, **kwargs) -> None: + if 'loaded_history' in kwargs: + self.loaded_history = kwargs['loaded_history'] + else: + self.loaded_history = None + # pre-load the in-context example + task_dir = os.path.join(self.in_context_example_dir, self.task_name) + self._in_context_example = { + 'with_tool': load_file(os.path.join(task_dir, 'with_tool.txt')), + } + self.metadata = {} + + @property + def task_id(self) -> str: + """Return the task id.""" + assert hasattr(self, '_id'), 'Task does not have an id.' + return self._id + + def in_context_example( + self, use_tool: bool = True, with_feedback: bool = False + ) -> str: + """Return the in-context example for the task.""" + if use_tool and not with_feedback: + return self._in_context_example['with_tool'] + else: + raise NotImplementedError + + @property + def prompt(self) -> str: + """Return the task prompt.""" + assert hasattr(self, '_prompt'), 'Task does not have a prompt.' + return self._prompt + + @property + def reference(self) -> str: + """Return the reference solution for the task.""" + assert hasattr(self, '_reference'), 'Task does not have a reference solution.' + return self._reference + + @abstractmethod + def extract_answer(self, solution: str) -> Optional[str]: + """Extract the answer from the given solution.""" + pass + + @abstractmethod + def success(self, solution: str) -> bool: + """This checks whether the given solution can complete the current task. + + Can be used to provide binary feedback. + """ + answer = self.extract_answer(solution) + return answer == self.reference + + @classmethod + def load_tasks(cls, path: str) -> Tuple[List['Task'], int]: + """Load all the tasks from a given jsonl file.""" + assert path.endswith('.jsonl') or path.endswith('.json') + with open(path, 'r') as f: + tasks = [cls(**json.loads(line)) for line in f.readlines()] + LOGGER.info(f'Loaded {len(tasks)} tasks from {path}') + return tasks, len(tasks) + + def to_dict(self) -> dict: + """Convert the task to a dictionary.""" + return { + 'task_name': self.task_name, + 'task_id': self.task_id, + 'prompt': self.prompt, + 'reference': self.reference, + 'metadata': self.metadata, + } + + +class ReasoningTask(Task): + task_name = 'reasoning' + + def __init__(self, id: str, prompt: str, reference: str, **kwargs): + super().__init__(**kwargs) + self._id = id + self._prompt = prompt.strip() + self._reference = str(reference).strip().lower() + + def extract_answer(self, solution: str) -> Optional[str]: + """Extract the answer from the given solution.""" + return solution.lower().strip() + + def compare_w_digits(self, reference: str, answer: str) -> bool: + """Compare the reference and answer with digits.""" + # if reference can and answer can both be converted to floats by float() + try: + float(reference) + float(answer) + return abs(float(reference) - float(answer)) <= 0.05 * abs(float(reference)) + except ValueError: + return reference in answer + except Exception: + raise ValueError(f'Cannot compare {reference} and {answer}') + + def success(self, solution: str) -> bool: + answer = self.extract_answer(solution) + return self.compare_w_digits(self._reference, answer) diff --git a/evaluation/mint/utils.py b/evaluation/mint/utils.py new file mode 100644 index 0000000000..3e8dd770c4 --- /dev/null +++ b/evaluation/mint/utils.py @@ -0,0 +1,10 @@ +import functools + + +# use cache to avoid loading the same file multiple times +# which can leads to too many open files error 
+@functools.lru_cache(maxsize=128) +def load_file(filepath: str) -> str: + with open(filepath, 'r') as f: + content = f.read() + return content
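
A note on the answer checking introduced in `evaluation/mint/task.py` above: `ReasoningTask.success()` lowercases and strips the model's final answer, accepts numeric answers that fall within 5% relative error of the reference, and falls back to substring matching when the reference is not numeric. The following is a minimal standalone sketch of that comparison logic, written only to illustrate the tolerance rule (it mirrors `compare_w_digits` from the patch but is not part of it):

```python
# Sketch of the tolerance check used by ReasoningTask.compare_w_digits():
# numeric answers pass if they are within 5% relative error of the reference;
# non-numeric references fall back to substring containment.
def compare_w_digits(reference: str, answer: str) -> bool:
    try:
        return abs(float(reference) - float(answer)) <= 0.05 * abs(float(reference))
    except ValueError:
        return reference in answer


print(compare_w_digits('700', '701'))                # True: within 5% of the reference
print(compare_w_digits('700', '800'))                # False: off by more than 5%
print(compare_w_digits('1/4', 'the answer is 1/4'))  # True: substring fallback for non-numeric references
```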