Support Entity-Deduction-Arena (EDA) Benchmark (#1931)

* add draft evaluation code for EDA, using ChatGPT as the agent for now

* Update README.md

* Delete frontend/package.json

* revert the irrelevant changes

* revert package.json

* use ChatGPT as the CodeActAgent

* integrate with opendevin

* Update evaluation/EDA/README.md

* Update evaluation/EDA/README.md

* Use poetry to manage packages

* integrate with opendevin

* minor update

* minor update

* update poetry

* update README

* clean-up infer scripts

* add run_infer script and improve readme

* log final success and final message & ground truth

---------

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Co-authored-by: yufansong <yufan@risingwave-labs.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Authored by Yizhe Zhang on 2024-05-25 08:17:04 -07:00, committed by GitHub
parent 28ab00946b
commit 0c829cd067
7 changed files with 865 additions and 6 deletions

evaluation/EDA/README.md Normal file (44 lines added)

@@ -0,0 +1,44 @@
# EDA Evaluation
This folder contains the evaluation harness for evaluating agents on the Entity-Deduction-Arena (EDA) benchmark, introduced in the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented at the ACL 2024 main conference.
## Configure OpenDevin and your LLM
Create a `config.toml` file at the root of the workspace if it does not already exist. Please check [README.md](../../README.md) for instructions on how to set it up.
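For reference, an LLM config group matching `model_config` might look like the sketch below. This is only an illustrative guess at the layout (the group name and the `model`, `api_key`, and `temperature` keys are assumptions); the authoritative schema is described in the main [README.md](../../README.md).
```toml
# Hypothetical config group; adjust the name and keys to match the documented schema.
[eval_gpt4o_2024_05_13]
model = "gpt-4o-2024-05-13"
api_key = "sk-XXX"
temperature = 0.0
```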
## Start the evaluation
```bash
export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate the answerer, i.e. the other party in the conversation)
./evaluation/EDA/scripts/run_infer.sh [model_config] [agent] [dataset] [eval_limit]
```
where `model_config` is mandatory, while `agent`, `dataset` and `eval_limit` are optional.
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
LLM settings, as defined in your `config.toml`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
to `CodeActAgent`.
- `dataset`: there are two tasks in this evaluation; set `dataset` to either `things` or `celebs` to choose the task (defaults to `things`).
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, all instances are evaluated.
Let's say you'd like to run 10 instances of the `things` task using `eval_gpt4o_2024_05_13` and CodeActAgent,
then your command would be:
```bash
./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent things 10
```
## Reference
```
@inproceedings{zhang2023entity,
title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
  booktitle={ACL},
year={2024}
}
```

evaluation/EDA/game.py Normal file (413 lines added)

@@ -0,0 +1,413 @@
import json
import logging
import os
import re
from typing import Optional
import openai
import requests.exceptions
import torch
from openai import OpenAI
from retry import retry
from transformers import AutoModelForCausalLM, AutoTokenizer
LOGGER = logging.getLogger(__name__)
def load_model(path):
print('Loading model...')
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
print('Tokenizer loaded.')
model = AutoModelForCausalLM.from_pretrained(
path, low_cpu_mem_usage=True, torch_dtype=torch.float16
).cuda()
print('Model loaded.')
# model.half().cuda()
return model, tokenizer
class Q20Game:
def __init__(
self,
item: str,
answerer_model: str = 'gpt-3.5-turbo-0613',
guesser_model: str = 'gpt-3.5-turbo-0613',
num_turns: int = 20,
temperature: float = 0.8,
openai_api: bool = True,
openai_api_key: Optional[str] = None,
        guesser_kargs: Optional[dict] = None,  # avoid sharing a mutable default across instances
) -> None:
self.item = item
self.answerer_model = answerer_model
self.guesser_model = guesser_model
self.num_turns = num_turns
self.temperature = temperature
self.openai_api = openai_api
        self.guesser_kargs = guesser_kargs if guesser_kargs is not None else {}
self.vicuna_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
self.first_user_utterance = (
'Your task is to ask a series of questions to deduce the entity '
"that I'm thinking of with as few queries as possible. "
"Only ask questions that can be answered by 'yes', 'no' or 'maybe'. "
'Do not ask for hint. Make your question brief with no linebreaker. '
'Now start asking a question.'
)
self.guesser_win = False
self.curr_turn = 0
if openai_api_key is not None:
openai.api_key = openai_api_key
if isinstance(answerer_model, str) and not answerer_model.startswith('gpt'):
self.user_api_base = 'http://0.0.0.0:8000/v1'
else:
self.user_api_base = 'https://api.openai.com/v1'
if isinstance(guesser_model, str) and not guesser_model.startswith('gpt'):
self.guesser_api_base = 'http://0.0.0.0:8000/v1'
else:
self.guesser_api_base = 'https://api.openai.com/v1'
self.guesser_messages = []
def confusion_matrix(self, path):
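        # Replay a saved dialogue loaded from `path`, force a final guess from the guesser,
        # and return True if the answerer confirms it.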
self.reset()
with open(path) as f:
raw_messages = json.load(f)
self.item = path.split('/')[-1].split('_')[0]
roles = ['assistant', 'user']
for i, message in enumerate(raw_messages):
self.guesser_messages.append(
{'role': roles[i % 2], 'content': message['content']}
)
self.guesser_messages = self.guesser_messages[:-2]
self.guesser_messages[-1]['content'] = (
self.guesser_messages[-1]['content'] + " You must guess now, what's it?"
)
guesser_msg = self.guesser(self.guesser_messages)
self.guesser_messages.append(guesser_msg)
guesser_question = guesser_msg['content'].strip()
self.guesser_messages[-1]['content'] = (
self.guesser_messages[-1]['content'] + ' Is it right?'
)
usr_msg = self.answerer(guesser_question)
self.guesser_messages.append(
{'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
)
if 'bingo' in self.guesser_messages[-1]['content'].lower():
self.guesser_win = True
return True
return False
@retry(
(
openai.Timeout,
requests.exceptions.ReadTimeout,
openai.RateLimitError,
openai.APIError,
requests.exceptions.HTTPError,
openai.APIConnectionError,
),
tries=5,
delay=0.5,
backoff=0.5,
max_delay=2,
logger=LOGGER,
)
def guesser(self, messages):
if not self.guesser_model.startswith('gpt'): # hf model
self.guesser_model, self.guesser_tokenizer = load_model(self.guesser_model)
# """Wraps hf's `generate` adding some specific method's defaults"""
assert not self.openai_api
prompt = self.dialog_history() + ' ASSISTANT:'
input_ids = torch.tensor(
[self.guesser_tokenizer.encode(prompt, add_special_tokens=True)]
) # TODO check if huggingface is using the same format.
input_ids = input_ids.to(self.guesser_model.base_model.device)
attention_mask = None
with torch.no_grad():
gen = self.guesser_model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
**self.guesser_kargs,
)
gen_str = (
self.guesser_tokenizer.decode(gen[0][input_ids[0].shape[0] :])
.split('</s>')[0]
.split('USER')[0]
.lstrip()
.strip()
)
return {
'role': 'assistant',
'content': gen_str,
}
else:
            # Pass the configured base URL explicitly; the module-level api_base is not used by the v1 client.
            client = OpenAI(api_key=openai.api_key, base_url=self.guesser_api_base)
response = client.chat.completions.create(
model=self.guesser_model,
messages=messages,
max_tokens=64,
n=1,
stop=None,
temperature=self.temperature,
)
return {
'role': 'assistant',
'content': response.choices[0].message.to_dict()['content'].strip(),
}
def dialog_history(self):
history = self.vicuna_prompt + ' '
for item in self.guesser_messages:
if item['role'].upper() == 'USER':
history += 'USER: ' + item['content']
elif item['role'].upper() == 'ASSISTANT':
history += ' ' + 'ASSISTANT: ' + item['content'] + '</s>'
return history
    def preprocess_response(self, response):
        # Normalize verbose phrasings such as "the entity you are thinking of" before
        # forwarding the guess, then record it in the dialogue history.
        for pattern, repl in [
            (r'the entity you are thinking of', 'it'),
            (r"the entity you're thinking of", 'it'),
            (r" you're thinking of", ''),
            (r' you are thinking of', ''),
        ]:
            response = re.sub(pattern, repl, response)
        # Keep the history format consistent: store the guess as a {'role', 'content'} message.
        self.guesser_messages.append({'role': 'assistant', 'content': response})
        return response
def judge_winner(self, response):
guesser_question = response.strip()
if self.curr_turn == self.num_turns - 1:
guesser_question += ' Is it right?'
# ask for answer
usr_msg = self.answerer(guesser_question)
if 'bingo' in usr_msg['content'].lower():
self.guesser_win = True
return True, ""
return False, usr_msg['content'].strip()
def generate_user_response(self, response):
response = self.preprocess_response(response)
        # Otherwise, forward the question to the answerer and relay its reply.
        bingo, answer_reply = self.judge_winner(response)
        if bingo:
            return 'Bingo! You got it. Quit now by running: <execute_bash> exit </execute_bash>.\n'
        if self.curr_turn == self.num_turns - 2:
            answer_reply += " You must guess now, what's it?"
        return answer_reply
def game_play(self, user_mode=False):
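        # Self-contained game loop: the guesser model (or a human, when user_mode=True) asks questions
        # and the answerer replies, for up to num_turns turns; returns True if the item is guessed.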
self.reset()
# print(f"Item: {self.item}")
for t in range(self.num_turns):
# System asking a question
if (not user_mode) or user_mode is None:
guesser_msg = self.guesser(self.guesser_messages)
guesser_msg['content'] = re.sub(
r'the entity you are thinking of', 'it', guesser_msg['content']
)
guesser_msg['content'] = re.sub(
r"the entity you're thinking of", 'it', guesser_msg['content']
)
guesser_msg['content'] = re.sub(
r" you're thinking of", '', guesser_msg['content']
)
guesser_msg['content'] = re.sub(
r' you are thinking of', '', guesser_msg['content']
)
else:
user_q = input(
f'Type in your questions for turn {t+1}. (e.g. Is it a living thing?)\n'
)
guesser_msg = {'role': 'assistant', 'content': user_q}
self.guesser_messages.append(guesser_msg)
guesser_question = guesser_msg['content'].strip()
if t == self.num_turns - 1:
self.guesser_messages[-1]['content'] = (
self.guesser_messages[-1]['content'] + ' Is it right?'
)
usr_msg = self.answerer(guesser_question)
self.guesser_messages.append(
{'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
)
if 'bingo' in usr_msg['content'].lower():
self.guesser_win = True
return True
if t == self.num_turns - 2:
self.guesser_messages[-1]['content'] = (
self.guesser_messages[-1]['content']
+ " You must guess now, what's it?"
)
return False
def save_session(self, path):
# Print the conversation
if not os.path.exists(path):
os.makedirs(path)
output_file = os.path.join(path, f'{self.item}.txt')
with open(output_file, 'w') as out_f:
out_f.write(f'item: {self.item}\n')
for t, message in enumerate(self.guesser_messages):
out_f.write(
f"Turn {(t+1)//2}, {message['role'].capitalize()}: {message['content'].lstrip()}\n"
)
def reward(self):
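        # A win within 5 turns earns the full reward of 1.0; each additional turn deducts 0.02. A loss scores 0.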
if self.guesser_win:
n_turns = (len(self.guesser_messages) + 1) // 2
return 1 - max(n_turns - 5, 0) * 0.02
return 0
def num_success(self):
return 1 if self.guesser_win else 0
def num_yes(self):
n_yes = sum(
['yes' in msg['content'].lower() for msg in self.guesser_messages[2::2]]
)
return n_yes
@retry(
(
openai.Timeout,
requests.exceptions.ReadTimeout,
openai.RateLimitError,
openai.APIError,
openai.APIConnectionError,
),
tries=5,
delay=0.5,
backoff=0.5,
max_delay=2,
logger=LOGGER,
)
def answerer(self, question):
        # Pass the configured base URL explicitly; the module-level api_base is not used by the v1 client.
        client = OpenAI(api_key=openai.api_key, base_url=self.user_api_base)
user_messages = [
{
'role': 'user',
'content': f'Based on your knowledge about {self.item}, '
f'respond to the following question or guess. '
f"Limit your respond to only 'Yes.', 'No.' or 'Maybe.', with no explanation or other words. "
f'Never say the answer {self.item} in your response. '
f"If the question is to solicit the answer, respond 'No.'.",
},
{
'role': 'user',
'content': f'For the entity {self.item}, {question} (Yes/No/Maybe)',
},
]
response = client.chat.completions.create(
model=self.answerer_model,
messages=user_messages,
max_tokens=6,
n=1,
stop=None,
temperature=0.2,
)
if any(
[
re.search(rf'(?:^|\W){i.strip().lower()}(?:$|\W)', question.lower())
for i in self.item.lower().split('|')
]
):
response.choices[0].message.content = 'Bingo!'
return response.choices[0].message.to_dict()
def reset(self):
# Initialize the conversation
self.curr_turn = 0
self.guesser_messages = [
{
'role': 'user',
'content': self.first_user_utterance,
}
]
class Q20GameCelebrity(Q20Game):
def __init__(self, item: str, **kwargs) -> None:
super().__init__(item, **kwargs)
self.first_user_utterance = (
'Your task is to ask a series of questions to deduce the celebrity '
"that I'm thinking of with as few queries as possible. "
"Only ask factual questions that can be answered by 'Yes.', 'No.' or 'Dunno.'. Do not ask for hint. Make your question brief with no linebreaker. "
'Now start asking a question.'
)
@retry(
(
openai.Timeout,
requests.exceptions.ReadTimeout,
openai.RateLimitError,
openai.APIError,
openai.APIConnectionError,
),
tries=5,
delay=0.5,
backoff=0.5,
max_delay=2,
logger=LOGGER,
)
def answerer(self, question):
        # Use the v1 client, matching the base class's answerer.
        client = OpenAI(api_key=openai.api_key, base_url=self.user_api_base)
user_messages = [
{
'role': 'system',
                'content': f'Based on your knowledge about the celebrity: {self.item}, '
f'respond to the following question or guess. '
f"Limit your respond to only 'Yes.', 'No.' or 'Dunno.', with no explanation or other words. "
f"Never say the name {self.item} in your response. Do not say 'Dunno.' if it can be answered by 'Yes.' or 'No.' "
f"If the question is to solicit the answer, respond 'No.'.",
},
{
'role': 'user',
                'content': f'For the celebrity {self.item}, {question} (Yes/No/Dunno)',
},
]
        response = client.chat.completions.create(
model=self.answerer_model,
messages=user_messages,
max_tokens=6,
n=1,
stop=None,
temperature=0.2,
)
if re.search(rf'(?:^|\W){self.item.lower()}(?:$|\W)', question.lower()):
response.choices[0].message.content = 'Bingo!'
return response.choices[0].message.to_dict()
def reset(self):
# Initialize the conversation
self.guesser_messages = [
{
'role': 'user',
'content': self.first_user_utterance,
}
]

evaluation/EDA/run_infer.py Normal file (329 lines added)

@@ -0,0 +1,329 @@
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor
# import huggingface_hub
from datasets import load_dataset
from tqdm import tqdm
from evaluation.EDA.game import Q20Game, Q20GameCelebrity
# from evaluation.EDA.scorer import question_scorer
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
game = None
def cleanup():
print('Cleaning up child processes...')
for process in mp.active_children():
print(f'Terminating child process: {process.name}')
process.terminate()
process.join()
def codeact_user_response(state: State) -> str:
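    # Simulated user turn: take the agent's latest guess from the event history and let the game's answerer reply to it.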
global game
model_guess = ''
if state.history:
for act, _ in reversed(state.history):
if isinstance(act, MessageAction) and act.source == 'agent':
model_guess = act.content
break
msg = game.generate_user_response(model_guess)
game.curr_turn += 1
logger.info(f'Model guess: {model_guess}')
    logger.info(f'Answer response: {msg}')
return msg
def monologue_user_response(state: State) -> str:
raise NotImplementedError('MonologueAgent should never ask for user responses.')
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
'MonologueAgent': monologue_user_response,
}
AGENT_CLS_TO_INST_SUFFIX = {
'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
}
def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
eval_output_dir = metadata['eval_output_dir']
if reset_logger:
# Set up logger
log_file = os.path.join(
eval_output_dir, 'logs', f'instance_{instance["text"].strip()}.log'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {instance["text"].strip()}.\nLOG: tail -f {log_file}'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
# Prepare instruction
_game_class = {'things': Q20Game, 'celebs': Q20GameCelebrity}
guesser_kargs = {
'max_new_tokens': 64,
'temperature': 0.8,
'repetition_penalty': 1.0,
'do_sample': True,
} # no penalty
    # Use the CodeActAgent as the guesser model
global game
game = _game_class[metadata['dataset']](
item=instance['text'].strip(),
answerer_model=metadata['answerer_model'],
guesser_model=None,
num_turns=metadata['max_iterations'],
openai_api_key=metadata['openai_api'],
guesser_kargs=guesser_kargs,
)
instruction = f'{game.first_user_utterance}'
logger.info(f'Instruction: {instruction}')
# instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
)
)
# ======= Attempt to evaluate the agent's edits =======
    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
final_message = ''
for act, _ in reversed(state.history):
if isinstance(act, MessageAction) and act.source == 'agent':
final_message = act.content
break
logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
test_result = game.reward()
# Save the output
output = {
'instance_id': instance['text'].strip(),
'instance': instance,
'instruction': instruction,
'metadata': metadata,
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'error': state.error if state and state.error else None,
'test_result': {
'success': test_result,
'final_message': final_message,
'ground_truth': instance['text'],
},
}
return output
if __name__ == '__main__':
parser = get_parser()
parser.add_argument(
'--answerer_model', '-a', default='gpt-3.5-turbo', help='answerer model'
)
parser.add_argument(
'--dataset',
default='things',
choices=['things', 'celebs'],
type=str,
help='dataset to be used',
)
parser.add_argument(
'--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key'
)
parser.add_argument(
'--data-split',
default='test',
type=str,
help='data split, eg, test',
)
args, _ = parser.parse_known_args()
if args.directory:
config.workspace_base = os.path.abspath(args.directory)
print(f'Setting workspace base to {config.workspace_base}')
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
eda_dataset = load_dataset(
'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
)
logger.info(
f'Evaluating Entity Deduction Arena {args.dataset} {args.data_split} split'
)
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
# for details of how to set `llm_config`
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
logger.info(f'Config for evaluation: {config}')
# TEST METADATA
agent_class = args.agent_cls
assert (
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
), f'Unsupported agent class: {agent_class}'
model_name = config.llm.model.split('/')[-1]
max_iterations = args.max_iterations
eval_note = ''
if args.eval_note is not None:
eval_note += '_N_' + args.eval_note
eval_output_dir = os.path.join(
args.eval_output_dir,
'eda',
agent_class,
model_name + '_maxiter_' + str(max_iterations) + eval_note,
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_dir}')
metadata = {
'dataset': args.dataset,
'data_split': args.data_split,
'answerer_model': args.answerer_model,
'agent_class': agent_class,
'openai_api': args.OPENAI_API_KEY,
'model_name': model_name,
'max_iterations': max_iterations,
'eval_output_dir': eval_output_dir,
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        # get the commit id of the current repo for reproducibility
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(),
}
logger.info(f'Metadata: {metadata}')
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
json.dump(metadata, f)
# LIMIT EVALUATION
eval_n_limit = args.eval_n_limit
if eval_n_limit:
eda_dataset = eda_dataset.select(list(range(eval_n_limit)))
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
# OUTPUT FILE
output_file = os.path.join(eval_output_dir, 'output.jsonl')
logger.info(f'Writing evaluation output to {output_file}')
finished_items = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_items.add(data['instance_id'])
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_items)} finished instances.'
)
output_fp = open(output_file, 'a')
logger.info(
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
)
# =============================================
# filter out finished instances
new_eda_dataset = []
for instance in eda_dataset:
if instance['text'].strip() in finished_items:
logger.info(
f'Skipping instance {instance["text"].strip()} as it is already finished.'
)
continue
new_eda_dataset.append(instance)
eda_dataset = new_eda_dataset
logger.info(
f'Finished instances: {len(finished_items)}, Remaining instances: {len(eda_dataset)}'
)
# =============================================
pbar = tqdm(total=len(eda_dataset))
    # This function tracks the progress AND writes the output to a JSONL file
def update_progress(future):
pbar.update(1)
output = future.result()
pbar.set_description(f'Instance {output["instance_id"]}')
pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
logger.info(
f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]}'
)
output_fp.write(json.dumps(output) + '\n')
output_fp.flush()
# This sets the multi-processing
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
# This is how we perform multi-processing
for instance in eda_dataset:
future = executor.submit(
process_instance,
instance,
agent_class,
metadata,
reset_logger=bool(num_workers > 1),
)
future.add_done_callback(update_progress)
futures.append(future)
# Wait for all futures to complete
for future in futures:
future.result()
except KeyboardInterrupt:
print('KeyboardInterrupt received. Cleaning up...')
cleanup()
output_fp.close()
logger.info('Evaluation finished.')

evaluation/EDA/scripts/run_infer.sh Normal file (49 lines added)

@@ -0,0 +1,49 @@
#!/bin/bash
MODEL_CONFIG=$1
AGENT=$2
DATASET=$3
EVAL_LIMIT=$4
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default CodeActAgent"
AGENT="CodeActAgent"
fi
if [ -z "$DATASET" ]; then
echo "Dataset not specified, use default 'things'"
DATASET="things"
fi
# check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
echo "OPENAI_API_KEY is not set, please set it to run the script"
exit 1
fi
# IMPORTANT: Because the agent's prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we need to track the agent version in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
COMMAND="poetry run python evaluation/EDA/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--dataset $DATASET \
--data-split test \
--max-iterations 20 \
--OPENAI_API_KEY $OPENAI_API_KEY \
--max-chars 10000000 \
--eval-num-workers 1 \
--eval-note ${AGENT_VERSION}_${DATASET}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND

evaluation/README.md (1 line added)

@@ -15,6 +15,7 @@ all the preprocessing/evaluation/analysis scripts.
- SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- GAIA: [`evaluation/gaia`](./gaia)
- Entity-Deduction-Arena (EDA): [`evaluation/EDA`](./EDA)
### Result Visualization

poetry.lock generated (34 lines changed)

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "aenum"
@@ -1169,6 +1169,17 @@ tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch
torch = ["torch"]
vision = ["Pillow (>=6.2.1)"]
[[package]]
name = "decorator"
version = "5.1.1"
description = "Decorators for Humans"
optional = false
python-versions = ">=3.5"
files = [
{file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"},
{file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"},
]
[[package]]
name = "deprecated"
version = "1.2.14"
@@ -3093,13 +3104,9 @@ files = [
{file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"},
{file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"},
{file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"},
{file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"},
@@ -5497,6 +5504,21 @@ requests = ">=2.0.0"
[package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
[[package]]
name = "retry"
version = "0.9.2"
description = "Easy to use retry decorator."
optional = false
python-versions = "*"
files = [
{file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"},
{file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"},
]
[package.dependencies]
decorator = ">=3.4.2"
py = ">=1.4.26,<2.0.0"
[[package]]
name = "rich"
version = "13.7.1"
@@ -7526,4 +7548,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "141771396f59fc23d52623ada07e4b89272ca781e5a2072f98ebccdf3f18a43b"
content-hash = "70be72e8064824ea756bf2543c8588e266a980e0e6dbc1fc50eecfb365c707d9"

pyproject.toml (1 line added)

@@ -69,6 +69,7 @@ concurrency = ["gevent"]
[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"
retry = "*"
evaluate = "*"
[build-system]