diff --git a/evaluation/EDA/README.md b/evaluation/EDA/README.md
new file mode 100644
index 0000000000..54dfed1994
--- /dev/null
+++ b/evaluation/EDA/README.md
@@ -0,0 +1,44 @@
+# EDA Evaluation
+
+This folder contains the evaluation harness for the Entity-Deduction Arena (EDA) benchmark, introduced in the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented at the ACL 2024 main conference.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file at the root of the workspace if it does not already exist. Please check [README.md](../../README.md) for how to set this up.
+
+## Start the evaluation
+
+```bash
+export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate the other party of the conversation)
+./evaluation/EDA/scripts/run_infer.sh [model_config] [agent] [dataset] [eval_limit]
+```
+
+where `model_config` is mandatory, while `agent`, `dataset` and `eval_limit` are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+- `agent`, e.g. `CodeActAgent`, is the name of the agent to evaluate, defaulting
+to `CodeActAgent`.
+
+- `dataset`: there are two tasks in this evaluation; set `dataset` to either `things` (default) or `celebs`.
+
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default the script evaluates all instances.
+
+For example, to run 10 instances with the `eval_gpt4o_2024_05_13` config group and `CodeActAgent` on the `things` task, the command would be:
+
+```bash
+./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent things 10
+```
+
+## Reference
+```
+@inproceedings{zhang2023entity,
+  title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
+  author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
+  booktitle={ACL},
+  year={2024}
+}
+```
diff --git a/evaluation/EDA/game.py b/evaluation/EDA/game.py
new file mode 100644
index 0000000000..3fb0ecc5c7
--- /dev/null
+++ b/evaluation/EDA/game.py
@@ -0,0 +1,413 @@
+import json
+import logging
+import os
+import re
+from typing import Optional
+
+import openai
+import requests.exceptions
+import torch
+from openai import OpenAI
+from retry import retry
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+LOGGER = logging.getLogger(__name__)
+
+
+def load_model(path):
+    print('Loading model...')
+    tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
+    print('Tokenizer loaded.')
+    model = AutoModelForCausalLM.from_pretrained(
+        path, low_cpu_mem_usage=True, torch_dtype=torch.float16
+    ).cuda()
+    print('Model loaded.')
+    # model.half().cuda()
+    return model, tokenizer
+
+
+class Q20Game:
+    def __init__(
+        self,
+        item: str,
+        answerer_model: str = 'gpt-3.5-turbo-0613',
+        guesser_model: str = 'gpt-3.5-turbo-0613',
+        num_turns: int = 20,
+        temperature: float = 0.8,
+        openai_api: bool = True,
+        openai_api_key: Optional[str] = None,
+        guesser_kargs: Optional[dict] = None,
+    ) -> None:
+        self.item = item
+        self.answerer_model = answerer_model
+        self.guesser_model = guesser_model
+        self.num_turns = num_turns
+        self.temperature = temperature
+        self.openai_api = openai_api
+        self.guesser_kargs = guesser_kargs if guesser_kargs is not None else {}
+        self.vicuna_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
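+        # NOTE: this Vicuna-style system prompt is only used when the guesser is a
+        # local HuggingFace checkpoint. In that case dialog_history() flattens
+        # self.guesser_messages into a single prompt of the form
+        #     "<system prompt> USER: <answer> ASSISTANT: <question></s> ..."
+        # before calling `generate` (see guesser() below).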
+        self.first_user_utterance = (
+            'Your task is to ask a series of questions to deduce the entity '
+            "that I'm thinking of with as few queries as possible. "
+            "Only ask questions that can be answered by 'yes', 'no' or 'maybe'. "
+            'Do not ask for hint. Make your question brief with no linebreaker. '
+            'Now start asking a question.'
+        )
+        self.guesser_win = False
+        self.curr_turn = 0
+        if openai_api_key is not None:
+            openai.api_key = openai_api_key
+
+        if isinstance(answerer_model, str) and not answerer_model.startswith('gpt'):
+            self.user_api_base = 'http://0.0.0.0:8000/v1'
+        else:
+            self.user_api_base = 'https://api.openai.com/v1'
+
+        if isinstance(guesser_model, str) and not guesser_model.startswith('gpt'):
+            self.guesser_api_base = 'http://0.0.0.0:8000/v1'
+        else:
+            self.guesser_api_base = 'https://api.openai.com/v1'
+
+        self.guesser_messages = []
+
+    def confusion_matrix(self, path):
+        self.reset()
+        with open(path) as f:
+            raw_messages = json.load(f)
+        self.item = path.split('/')[-1].split('_')[0]
+        roles = ['assistant', 'user']
+        for i, message in enumerate(raw_messages):
+            self.guesser_messages.append(
+                {'role': roles[i % 2], 'content': message['content']}
+            )
+
+        self.guesser_messages = self.guesser_messages[:-2]
+        self.guesser_messages[-1]['content'] = (
+            self.guesser_messages[-1]['content'] + " You must guess now, what's it?"
+        )
+        guesser_msg = self.guesser(self.guesser_messages)
+        self.guesser_messages.append(guesser_msg)
+        guesser_question = guesser_msg['content'].strip()
+        self.guesser_messages[-1]['content'] = (
+            self.guesser_messages[-1]['content'] + ' Is it right?'
+        )
+        usr_msg = self.answerer(guesser_question)
+        self.guesser_messages.append(
+            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
+        )
+
+        if 'bingo' in self.guesser_messages[-1]['content'].lower():
+            self.guesser_win = True
+            return True
+
+        return False
+
+    @retry(
+        (
+            openai.Timeout,
+            requests.exceptions.ReadTimeout,
+            openai.RateLimitError,
+            openai.APIError,
+            requests.exceptions.HTTPError,
+            openai.APIConnectionError,
+        ),
+        tries=5,
+        delay=0.5,
+        backoff=0.5,
+        max_delay=2,
+        logger=LOGGER,
+    )
+    def guesser(self, messages):
+        if not self.guesser_model.startswith('gpt'):  # hf model
+            self.guesser_model, self.guesser_tokenizer = load_model(self.guesser_model)
+
+            # """Wraps hf's `generate` adding some specific method's defaults"""
+            assert not self.openai_api
+            prompt = self.dialog_history() + ' ASSISTANT:'
+            input_ids = torch.tensor(
+                [self.guesser_tokenizer.encode(prompt, add_special_tokens=True)]
+            )  # TODO check if huggingface is using the same format.
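+            # NOTE: on this path the full dialog is re-encoded and regenerated from
+            # scratch at every turn; the decoded output below is truncated at the
+            # '</s>' EOS token and at the first 'USER' marker so that only the new
+            # guesser utterance is returned. After this first call,
+            # self.guesser_model holds the loaded HF model object, not the path.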
+            input_ids = input_ids.to(self.guesser_model.base_model.device)
+            attention_mask = None
+
+            with torch.no_grad():
+                gen = self.guesser_model.generate(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    **self.guesser_kargs,
+                )
+            gen_str = (
+                self.guesser_tokenizer.decode(gen[0][input_ids[0].shape[0] :])
+                .split('</s>')[0]
+                .split('USER')[0]
+                .lstrip()
+                .strip()
+            )
+
+            return {
+                'role': 'assistant',
+                'content': gen_str,
+            }
+        else:
+            openai.api_base = self.guesser_api_base
+            client = OpenAI(api_key=openai.api_key)
+            response = client.chat.completions.create(
+                model=self.guesser_model,
+                messages=messages,
+                max_tokens=64,
+                n=1,
+                stop=None,
+                temperature=self.temperature,
+            )
+            return {
+                'role': 'assistant',
+                'content': response.choices[0].message.to_dict()['content'].strip(),
+            }
+
+    def dialog_history(self):
+        history = self.vicuna_prompt + ' '
+        for item in self.guesser_messages:
+            if item['role'].upper() == 'USER':
+                history += 'USER: ' + item['content']
+            elif item['role'].upper() == 'ASSISTANT':
+                history += ' ' + 'ASSISTANT: ' + item['content'] + '</s>'
+        return history
+
+    def preprocess_response(self, response):
+        response = re.sub(
+            r'the entity you are thinking of', 'it', response
+        )
+        response = re.sub(
+            r"the entity you're thinking of", 'it', response
+        )
+        response = re.sub(
+            r" you're thinking of", '', response
+        )
+        response = re.sub(
+            r' you are thinking of', '', response
+        )
+        self.guesser_messages.append(response)
+        return response
+
+    def judge_winner(self, response):
+        guesser_question = response.strip()
+
+        if self.curr_turn == self.num_turns - 1:
+            guesser_question += ' Is it right?'
+        # ask for answer
+        usr_msg = self.answerer(guesser_question)
+
+        if 'bingo' in usr_msg['content'].lower():
+            self.guesser_win = True
+            return True, ""
+
+        return False, usr_msg['content'].strip()
+
+    def generate_user_response(self, response):
+        response = self.preprocess_response(response)
+        # others
+        bingo, answer_reply = self.judge_winner(response)
+        if bingo:
+            return "You are bingo! quit now, run: exit .\n"
+        if self.curr_turn == self.num_turns - 2:
+            answer_reply += " You must guess now, what's it?"
+        return answer_reply
+
+    def game_play(self, user_mode=False):
+        self.reset()
+        # print(f"Item: {self.item}")
+        for t in range(self.num_turns):
+            # System asking a question
+            if (not user_mode) or user_mode is None:
+                guesser_msg = self.guesser(self.guesser_messages)
+                guesser_msg['content'] = re.sub(
+                    r'the entity you are thinking of', 'it', guesser_msg['content']
+                )
+                guesser_msg['content'] = re.sub(
+                    r"the entity you're thinking of", 'it', guesser_msg['content']
+                )
+                guesser_msg['content'] = re.sub(
+                    r" you're thinking of", '', guesser_msg['content']
+                )
+                guesser_msg['content'] = re.sub(
+                    r' you are thinking of', '', guesser_msg['content']
+                )
+            else:
+                user_q = input(
+                    f'Type in your questions for turn {t+1}. (e.g. Is it a living thing?)\n'
+                )
+                guesser_msg = {'role': 'assistant', 'content': user_q}
+            self.guesser_messages.append(guesser_msg)
+            guesser_question = guesser_msg['content'].strip()
+
+            if t == self.num_turns - 1:
+                self.guesser_messages[-1]['content'] = (
+                    self.guesser_messages[-1]['content'] + ' Is it right?'
+                )
+
+            usr_msg = self.answerer(guesser_question)
+            self.guesser_messages.append(
+                {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
+            )
+
+            if 'bingo' in usr_msg['content'].lower():
+                self.guesser_win = True
+                return True
+
+            if t == self.num_turns - 2:
+                self.guesser_messages[-1]['content'] = (
+                    self.guesser_messages[-1]['content']
+                    + " You must guess now, what's it?"
+                )
+
+        return False
+
+    def save_session(self, path):
+        # Print the conversation
+        if not os.path.exists(path):
+            os.makedirs(path)
+        output_file = os.path.join(path, f'{self.item}.txt')
+        with open(output_file, 'w') as out_f:
+            out_f.write(f'item: {self.item}\n')
+            for t, message in enumerate(self.guesser_messages):
+                out_f.write(
+                    f"Turn {(t+1)//2}, {message['role'].capitalize()}: {message['content'].lstrip()}\n"
+                )
+
+    def reward(self):
+        if self.guesser_win:
+            n_turns = (len(self.guesser_messages) + 1) // 2
+            return 1 - max(n_turns - 5, 0) * 0.02
+        return 0
+
+    def num_success(self):
+        return 1 if self.guesser_win else 0
+
+    def num_yes(self):
+        n_yes = sum(
+            ['yes' in msg['content'].lower() for msg in self.guesser_messages[2::2]]
+        )
+        return n_yes
+
+    @retry(
+        (
+            openai.Timeout,
+            requests.exceptions.ReadTimeout,
+            openai.RateLimitError,
+            openai.APIError,
+            openai.APIConnectionError,
+        ),
+        tries=5,
+        delay=0.5,
+        backoff=0.5,
+        max_delay=2,
+        logger=LOGGER,
+    )
+    def answerer(self, question):
+        openai.api_base = self.user_api_base
+        client = OpenAI(api_key=openai.api_key)
+        user_messages = [
+            {
+                'role': 'user',
+                'content': f'Based on your knowledge about {self.item}, '
+                f'respond to the following question or guess. '
+                f"Limit your response to only 'Yes.', 'No.' or 'Maybe.', with no explanation or other words. "
+                f'Never say the answer {self.item} in your response. '
+                f"If the question is to solicit the answer, respond 'No.'.",
+            },
+            {
+                'role': 'user',
+                'content': f'For the entity {self.item}, {question} (Yes/No/Maybe)',
+            },
+        ]
+
+        response = client.chat.completions.create(
+            model=self.answerer_model,
+            messages=user_messages,
+            max_tokens=6,
+            n=1,
+            stop=None,
+            temperature=0.2,
+        )
+        if any(
+            [
+                re.search(rf'(?:^|\W){i.strip().lower()}(?:$|\W)', question.lower())
+                for i in self.item.lower().split('|')
+            ]
+        ):
+            response.choices[0].message.content = 'Bingo!'
+        return response.choices[0].message.to_dict()
+
+    def reset(self):
+        # Initialize the conversation
+        self.curr_turn = 0
+        self.guesser_messages = [
+            {
+                'role': 'user',
+                'content': self.first_user_utterance,
+            }
+        ]
+
+
+class Q20GameCelebrity(Q20Game):
+    def __init__(self, item: str, **kwargs) -> None:
+        super().__init__(item, **kwargs)
+        self.first_user_utterance = (
+            'Your task is to ask a series of questions to deduce the celebrity '
+            "that I'm thinking of with as few queries as possible. "
+            "Only ask factual questions that can be answered by 'Yes.', 'No.' or 'Dunno.'. Do not ask for hint. Make your question brief with no linebreaker. "
+            'Now start asking a question.'
+        )
+
+    @retry(
+        (
+            openai.Timeout,
+            requests.exceptions.ReadTimeout,
+            openai.RateLimitError,
+            openai.APIError,
+            openai.APIConnectionError,
+        ),
+        tries=5,
+        delay=0.5,
+        backoff=0.5,
+        max_delay=2,
+        logger=LOGGER,
+    )
+    def answerer(self, question):
+        openai.api_base = self.user_api_base
+        user_messages = [
+            {
+                'role': 'system',
+                'content': f'Based on your knowledge about the celebrity: {self.item}, '
+                f'respond to the following question or guess. '
+                f"Limit your response to only 'Yes.', 'No.' or 'Dunno.', with no explanation or other words. "
+                f"Never say the name {self.item} in your response. Do not say 'Dunno.' if it can be answered by 'Yes.' or 'No.' "
+                f"If the question is to solicit the answer, respond 'No.'.",
+            },
+            {
+                'role': 'user',
+                'content': f'For the celebrity {self.item}, {question}(Yes/No/Dunno)',
+            },
+        ]
+
+        client = OpenAI(api_key=openai.api_key)
+        response = client.chat.completions.create(
+            model=self.answerer_model,
+            messages=user_messages,
+            max_tokens=6,
+            n=1,
+            stop=None,
+            temperature=0.2,
+        )
+        if re.search(rf'(?:^|\W){self.item.lower()}(?:$|\W)', question.lower()):
+            response.choices[0].message.content = 'Bingo!'
+        return response.choices[0].message.to_dict()
+
+    def reset(self):
+        # Initialize the conversation
+        self.guesser_messages = [
+            {
+                'role': 'user',
+                'content': self.first_user_utterance,
+            }
+        ]
diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
new file mode 100644
index 0000000000..d66c3a13c9
--- /dev/null
+++ b/evaluation/EDA/run_infer.py
@@ -0,0 +1,329 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+# import huggingface_hub
+from datasets import load_dataset
+from tqdm import tqdm
+
+from evaluation.EDA.game import Q20Game, Q20GameCelebrity
+
+# from evaluation.EDA.scorer import question_scorer
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+game = None
+
+
+def cleanup():
+    print('Cleaning up child processes...')
+    for process in mp.active_children():
+        print(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State) -> str:
+    global game
+    model_guess = ''
+    if state.history:
+        for act, _ in reversed(state.history):
+            if isinstance(act, MessageAction) and act.source == 'agent':
+                model_guess = act.content
+                break
+    msg = game.generate_user_response(model_guess)
+    game.curr_turn += 1
+    logger.info(f'Model guess: {model_guess}')
+    logger.info(f'Answerer response: {msg}')
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
+}
+
+
+def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
+    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
+    eval_output_dir = metadata['eval_output_dir']
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir, 'logs', f'instance_{instance["text"].strip()}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance["text"].strip()}.\nLOG: tail -f {log_file}'
+        )
+        # Remove all
existing handlers from logger + for handler in logger.handlers[:]: + logger.removeHandler(handler) + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter( + logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + ) + logger.addHandler(file_handler) + + # Prepare instruction + _game_class = {'things': Q20Game, 'celebs': Q20GameCelebrity} + + guesser_kargs = { + 'max_new_tokens': 64, + 'temperature': 0.8, + 'repetition_penalty': 1.0, + 'do_sample': True, + } # no penalty + + # Use codeactagent as guesser_model + global game + game = _game_class[metadata['dataset']]( + item=instance['text'].strip(), + answerer_model=metadata['answerer_model'], + guesser_model=None, + num_turns=metadata['max_iterations'], + openai_api_key=metadata['openai_api'], + guesser_kargs=guesser_kargs, + ) + + instruction = f'{game.first_user_utterance}' + logger.info(f'Instruction: {instruction}') + + # instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n' + # NOTE: You can actually set slightly different instruction for different agents + instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '') + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + + state: State = asyncio.run( + main( + instruction, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class), + ) + ) + # ======= Attempt to evaluate the agent's edits ======= + # If you are working on simplier benchmark that only evaluates the final model output (e.g., in a MessageAction) + # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation. + + if state is None: + raise ValueError('State should not be None.') + + final_message = '' + for act, _ in reversed(state.history): + if isinstance(act, MessageAction) and act.source == 'agent': + final_message = act.content + break + + logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}') + test_result = game.reward() + + # Save the output + output = { + 'instance_id': instance['text'].strip(), + 'instance': instance, + 'instruction': instruction, + 'metadata': metadata, + 'history': [ + (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history + ], + 'error': state.error if state and state.error else None, + 'test_result': { + 'success': test_result, + 'final_message': final_message, + 'ground_truth': instance['text'], + }, + } + + return output + + +if __name__ == '__main__': + parser = get_parser() + parser.add_argument( + '--answerer_model', '-a', default='gpt-3.5-turbo', help='answerer model' + ) + parser.add_argument( + '--dataset', + default='things', + choices=['things', 'celebs'], + type=str, + help='dataset to be used', + ) + parser.add_argument( + '--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key' + ) + parser.add_argument( + '--data-split', + default='test', + type=str, + help='data split, eg, test', + ) + args, _ = parser.parse_known_args() + if args.directory: + config.workspace_base = os.path.abspath(args.directory) + print(f'Setting workspace base to {config.workspace_base}') + # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing + # so we don't need to manage file uploading to OpenDevin's repo + eda_dataset = load_dataset( + 'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split + ) + logger.info( + f'Evaluating Entity Deduction Arena {args.dataset} 
{args.data_split} split' + ) + + # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm + # for details of how to set `llm_config` + if args.llm_config: + specified_llm_config = get_llm_config_arg(args.llm_config) + if specified_llm_config: + config.llm = specified_llm_config + logger.info(f'Config for evaluation: {config}') + + # TEST METADATA + agent_class = args.agent_cls + assert ( + agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN + ), f'Unsupported agent class: {agent_class}' + model_name = config.llm.model.split('/')[-1] + max_iterations = args.max_iterations + eval_note = '' + if args.eval_note is not None: + eval_note += '_N_' + args.eval_note + eval_output_dir = os.path.join( + args.eval_output_dir, + 'eda', + agent_class, + model_name + '_maxiter_' + str(max_iterations) + eval_note, + ) + + pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True) + pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir( + parents=True, exist_ok=True + ) + logger.info(f'Using evaluation output directory: {eval_output_dir}') + + metadata = { + 'dataset': args.dataset, + 'data_split': args.data_split, + 'answerer_model': args.answerer_model, + 'agent_class': agent_class, + 'openai_api': args.OPENAI_API_KEY, + 'model_name': model_name, + 'max_iterations': max_iterations, + 'eval_output_dir': eval_output_dir, + 'start_time': time.strftime('%Y-%m-%d %H:%M:%S'), + # get the commit id of current repo for reproduciblity + 'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD']) + .decode('utf-8') + .strip(), + } + logger.info(f'Metadata: {metadata}') + with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f: + json.dump(metadata, f) + + # LIMIT EVALUATION + eval_n_limit = args.eval_n_limit + if eval_n_limit: + eda_dataset = eda_dataset.select(list(range(eval_n_limit))) + logger.info(f'Limiting evaluation to first {eval_n_limit} instances.') + + # OUTPUT FILE + output_file = os.path.join(eval_output_dir, 'output.jsonl') + logger.info(f'Writing evaluation output to {output_file}') + finished_items = set() + if os.path.exists(output_file): + with open(output_file, 'r') as f: + for line in f: + data = json.loads(line) + finished_items.add(data['instance_id']) + logger.warning( + f'Output file {output_file} already exists. Loaded {len(finished_items)} finished instances.' + ) + output_fp = open(output_file, 'a') + + logger.info( + f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.' + ) + + # ============================================= + # filter out finished instances + new_eda_dataset = [] + for instance in eda_dataset: + if instance['text'].strip() in finished_items: + logger.info( + f'Skipping instance {instance["text"].strip()} as it is already finished.' 
+ ) + continue + new_eda_dataset.append(instance) + + eda_dataset = new_eda_dataset + logger.info( + f'Finished instances: {len(finished_items)}, Remaining instances: {len(eda_dataset)}' + ) + # ============================================= + + pbar = tqdm(total=len(eda_dataset)) + + # This function tracks the progress AND write the output to a JSONL file + def update_progress(future): + pbar.update(1) + output = future.result() + pbar.set_description(f'Instance {output["instance_id"]}') + pbar.set_postfix_str(f'Test Result: {output["test_result"]}') + logger.info( + f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]}' + ) + output_fp.write(json.dumps(output) + '\n') + output_fp.flush() + + # This sets the multi-processing + num_workers = args.eval_num_workers + logger.info(f'Using {num_workers} workers for evaluation.') + + try: + with ProcessPoolExecutor(num_workers) as executor: + futures = [] + # This is how we perform multi-processing + for instance in eda_dataset: + future = executor.submit( + process_instance, + instance, + agent_class, + metadata, + reset_logger=bool(num_workers > 1), + ) + future.add_done_callback(update_progress) + futures.append(future) + + # Wait for all futures to complete + for future in futures: + future.result() + except KeyboardInterrupt: + print('KeyboardInterrupt received. Cleaning up...') + cleanup() + + output_fp.close() + logger.info('Evaluation finished.') diff --git a/evaluation/EDA/scripts/run_infer.sh b/evaluation/EDA/scripts/run_infer.sh new file mode 100755 index 0000000000..4a35f2680e --- /dev/null +++ b/evaluation/EDA/scripts/run_infer.sh @@ -0,0 +1,49 @@ +#!/bin/bash +MODEL_CONFIG=$1 +AGENT=$2 +DATASET=$3 +EVAL_LIMIT=$4 + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi + +if [ -z "$DATASET" ]; then + echo "Dataset not specified, use default 'things'" + DATASET="things" +fi + +# check if OPENAI_API_KEY is set +if [ -z "$OPENAI_API_KEY" ]; then + echo "OPENAI_API_KEY is not set, please set it to run the script" + exit 1 +fi + +# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin +# We need to track the version of Agent in the evaluation to make sure results are comparable +AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)") + +echo "AGENT: $AGENT" +echo "AGENT_VERSION: $AGENT_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" +echo "DATASET: $DATASET" + +COMMAND="poetry run python evaluation/EDA/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --dataset $DATASET \ + --data-split test \ + --max-iterations 20 \ + --OPENAI_API_KEY $OPENAI_API_KEY \ + --max-chars 10000000 \ + --eval-num-workers 1 \ + --eval-note ${AGENT_VERSION}_${DATASET}" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/evaluation/README.md b/evaluation/README.md index 38baaae933..047d09c4e8 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -15,6 +15,7 @@ all the preprocessing/evaluation/analysis scripts. 
- SWE-Bench: [`evaluation/swe_bench`](./swe_bench) - HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix) - GAIA: [`evaluation/gaia`](./gaia) +- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA) ### Result Visualization diff --git a/poetry.lock b/poetry.lock index a6184934f0..7167825d3e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aenum" @@ -1169,6 +1169,17 @@ tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch torch = ["torch"] vision = ["Pillow (>=6.2.1)"] +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +optional = false +python-versions = ">=3.5" +files = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] + [[package]] name = "deprecated" version = "1.2.14" @@ -3093,13 +3104,9 @@ files = [ {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"}, {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"}, - {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"}, - {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"}, - {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"}, - {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"}, {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"}, {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"}, @@ -5497,6 +5504,21 @@ requests = ">=2.0.0" [package.extras] rsa = ["oauthlib[signedtoken] (>=3.0.0)"] +[[package]] +name = "retry" +version = "0.9.2" +description = "Easy to use retry decorator." 
+optional = false +python-versions = "*" +files = [ + {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"}, + {file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"}, +] + +[package.dependencies] +decorator = ">=3.4.2" +py = ">=1.4.26,<2.0.0" + [[package]] name = "rich" version = "13.7.1" @@ -7526,4 +7548,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "141771396f59fc23d52623ada07e4b89272ca781e5a2072f98ebccdf3f18a43b" +content-hash = "70be72e8064824ea756bf2543c8588e266a980e0e6dbc1fc50eecfb365c707d9" diff --git a/pyproject.toml b/pyproject.toml index 4822b8a6e7..5487c8e7e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ concurrency = ["gevent"] [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*" +retry = "*" evaluate = "*" [build-system]