mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-08 22:38:05 -05:00
Support Entity-Deduction-Arena (EDA) Benchmark (#1931)
* adding draft evaluation code for EDA, using chatgpt as the temporal agent for now
* Update README.md
* Delete frontend/package.json
* reverse the irrelevant changes
* reverse package.json
* use chatgpt as the codeactagent
* integrate with opendevin
* Update evaluation/EDA/README.md
* Update evaluation/EDA/README.md
* Use poetry to manage packages
* integrate with opendevin
* minor update
* minor update
* update poetry
* update README
* clean-up infer scripts
* add run_infer script and improve readme
* log final success and final message & ground truth

---------

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Co-authored-by: yufansong <yufan@risingwave-labs.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
44 evaluation/EDA/README.md Normal file
@@ -0,0 +1,44 @@
# EDA Evaluation

This folder contains the evaluation harness for evaluating agents on the Entity-Deduction-Arena (EDA) benchmark, introduced in the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented at the ACL 2024 main conference.

## Configure OpenDevin and your LLM

Create a `config.toml` file at the root of the workspace if it does not exist. Please check [README.md](../../README.md) for how to set this up.

## Start the evaluation

```bash
export OPENAI_API_KEY="sk-XXX"  # Required for evaluation (to simulate the other party of the conversation)
./evaluation/EDA/scripts/run_infer.sh [model_config] [agent] [dataset] [eval_limit]
```

where `model_config` is mandatory, while `agent`, `dataset` and `eval_limit` are optional.

- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.

- `agent`, e.g. `CodeActAgent`, is the name of the agent to benchmark, defaulting to `CodeActAgent`.

- `dataset`: there are two tasks in this evaluation; specify either the `things` or the `celebs` task.

- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, all instances are evaluated.

Let's say you'd like to run 10 instances of the `things` task using `eval_gpt4o_2024_05_13` and CodeActAgent; your command would then be:

```bash
./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent things 10
```
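Each evaluated instance is appended as one JSON line to `output.jsonl` inside the run's output directory, together with the final agent message and the ground-truth entity. A minimal sketch for checking results afterwards (the path is illustrative; point it at the `output.jsonl` produced by `run_infer.py`):

```python
import json

# Hypothetical path: adjust to your evaluation output directory.
with open('output.jsonl') as f:
    records = [json.loads(line) for line in f]

for r in records:
    res = r['test_result']
    # 'success' is the game reward (0 if the entity was never guessed),
    # 'final_message' is the agent's last guess, 'ground_truth' the target entity.
    print(r['instance_id'], res['success'], res['final_message'], res['ground_truth'])
```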

## Reference

```
@inproceedings{zhang2023entity,
  title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
  author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
  booktitle={ACL},
  year={2024}
}
```
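Note that `OPENAI_API_KEY` is required even when the evaluated agent uses a different LLM, because the answerer side of the game (the party that knows the secret entity) is simulated with an OpenAI chat model. `game.py` below implements this in `Q20Game.answerer()`; stripped down, that call looks roughly like the following sketch (the prompt is paraphrased, not the exact one used):

```python
from openai import OpenAI

client = OpenAI()                   # reads OPENAI_API_KEY from the environment
item = 'penguin'                    # hypothetical secret entity
question = 'Is it a living thing?'  # a guesser question

response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=[
        {'role': 'user', 'content': f"Based on your knowledge about {item}, answer the "
                                    "following question with only 'Yes.', 'No.' or 'Maybe.'."},
        {'role': 'user', 'content': f'For the entity {item}, {question} (Yes/No/Maybe)'},
    ],
    max_tokens=6,
    temperature=0.2,
)
print(response.choices[0].message.content)
```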
413 evaluation/EDA/game.py Normal file
@@ -0,0 +1,413 @@
import json
import logging
import os
import re
from typing import Optional

import openai
import requests.exceptions
import torch
from openai import OpenAI
from retry import retry
from transformers import AutoModelForCausalLM, AutoTokenizer

LOGGER = logging.getLogger(__name__)


def load_model(path):
    print('Loading model...')
    tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
    print('Tokenizer loaded.')
    model = AutoModelForCausalLM.from_pretrained(
        path, low_cpu_mem_usage=True, torch_dtype=torch.float16
    ).cuda()
    print('Model loaded.')
    # model.half().cuda()
    return model, tokenizer


class Q20Game:
    def __init__(
        self,
        item: str,
        answerer_model: str = 'gpt-3.5-turbo-0613',
        guesser_model: str = 'gpt-3.5-turbo-0613',
        num_turns: int = 20,
        temperature: float = 0.8,
        openai_api: bool = True,
        openai_api_key: Optional[str] = None,
        guesser_kargs={},
    ) -> None:
        self.item = item
        self.answerer_model = answerer_model
        self.guesser_model = guesser_model
        self.num_turns = num_turns
        self.temperature = temperature
        self.openai_api = openai_api
        self.guesser_kargs = guesser_kargs
        self.vicuna_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
        self.first_user_utterance = (
            'Your task is to ask a series of questions to deduce the entity '
            "that I'm thinking of with as few queries as possible. "
            "Only ask questions that can be answered by 'yes', 'no' or 'maybe'. "
            'Do not ask for hint. Make your question brief with no linebreaker. '
            'Now start asking a question.'
        )
        self.guesser_win = False
        self.curr_turn = 0
        if openai_api_key is not None:
            openai.api_key = openai_api_key

        if isinstance(answerer_model, str) and not answerer_model.startswith('gpt'):
            self.user_api_base = 'http://0.0.0.0:8000/v1'
        else:
            self.user_api_base = 'https://api.openai.com/v1'

        if isinstance(guesser_model, str) and not guesser_model.startswith('gpt'):
            self.guesser_api_base = 'http://0.0.0.0:8000/v1'
        else:
            self.guesser_api_base = 'https://api.openai.com/v1'

        self.guesser_messages = []

    def confusion_matrix(self, path):
        self.reset()
        with open(path) as f:
            raw_messages = json.load(f)
        self.item = path.split('/')[-1].split('_')[0]
        roles = ['assistant', 'user']
        for i, message in enumerate(raw_messages):
            self.guesser_messages.append(
                {'role': roles[i % 2], 'content': message['content']}
            )

        self.guesser_messages = self.guesser_messages[:-2]
        self.guesser_messages[-1]['content'] = (
            self.guesser_messages[-1]['content'] + " You must guess now, what's it?"
        )
        guesser_msg = self.guesser(self.guesser_messages)
        self.guesser_messages.append(guesser_msg)
        guesser_question = guesser_msg['content'].strip()
        self.guesser_messages[-1]['content'] = (
            self.guesser_messages[-1]['content'] + ' Is it right?'
        )
        usr_msg = self.answerer(guesser_question)
        self.guesser_messages.append(
            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
        )

        if 'bingo' in self.guesser_messages[-1]['content'].lower():
            self.guesser_win = True
            return True

        return False

    @retry(
        (
            openai.Timeout,
            requests.exceptions.ReadTimeout,
            openai.RateLimitError,
            openai.APIError,
            requests.exceptions.HTTPError,
            openai.APIConnectionError,
        ),
        tries=5,
        delay=0.5,
        backoff=0.5,
        max_delay=2,
        logger=LOGGER,
    )
    def guesser(self, messages):
        if not self.guesser_model.startswith('gpt'):  # hf model
            self.guesser_model, self.guesser_tokenizer = load_model(self.guesser_model)

            # """Wraps hf's `generate` adding some specific method's defaults"""
            assert not self.openai_api
            prompt = self.dialog_history() + ' ASSISTANT:'
            input_ids = torch.tensor(
                [self.guesser_tokenizer.encode(prompt, add_special_tokens=True)]
            )  # TODO check if huggingface is using the same format.
            input_ids = input_ids.to(self.guesser_model.base_model.device)
            attention_mask = None

            with torch.no_grad():
                gen = self.guesser_model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    **self.guesser_kargs,
                )
            gen_str = (
                self.guesser_tokenizer.decode(gen[0][input_ids[0].shape[0] :])
                .split('</s>')[0]
                .split('USER')[0]
                .lstrip()
                .strip()
            )

            return {
                'role': 'assistant',
                'content': gen_str,
            }
        else:
            openai.api_base = self.guesser_api_base
            client = OpenAI(api_key=openai.api_key)
            response = client.chat.completions.create(
                model=self.guesser_model,
                messages=messages,
                max_tokens=64,
                n=1,
                stop=None,
                temperature=self.temperature,
            )
            return {
                'role': 'assistant',
                'content': response.choices[0].message.to_dict()['content'].strip(),
            }

    def dialog_history(self):
        history = self.vicuna_prompt + ' '
        for item in self.guesser_messages:
            if item['role'].upper() == 'USER':
                history += 'USER: ' + item['content']
            elif item['role'].upper() == 'ASSISTANT':
                history += ' ' + 'ASSISTANT: ' + item['content'] + '</s>'
        return history

    def preprocess_response(self, response):
        response = re.sub(r'the entity you are thinking of', 'it', response)
        response = re.sub(r"the entity you're thinking of", 'it', response)
        response = re.sub(r" you're thinking of", '', response)
        response = re.sub(r' you are thinking of', '', response)
        self.guesser_messages.append(response)
        return response

    def judge_winner(self, response):
        guesser_question = response.strip()

        if self.curr_turn == self.num_turns - 1:
            guesser_question += ' Is it right?'
        # ask for answer
        usr_msg = self.answerer(guesser_question)

        if 'bingo' in usr_msg['content'].lower():
            self.guesser_win = True
            return True, ''

        return False, usr_msg['content'].strip()

    def generate_user_response(self, response):
        response = self.preprocess_response(response)
        # others
        bingo, answer_reply = self.judge_winner(response)
        if bingo:
            return "You are bingo! quit now, run: <execute_bash> exit </execute_bash>.\n"
        if self.curr_turn == self.num_turns - 2:
            answer_reply += " You must guess now, what's it?"
        return answer_reply

    def game_play(self, user_mode=False):
        self.reset()
        # print(f"Item: {self.item}")
        for t in range(self.num_turns):
            # System asking a question
            if (not user_mode) or user_mode is None:
                guesser_msg = self.guesser(self.guesser_messages)
                guesser_msg['content'] = re.sub(
                    r'the entity you are thinking of', 'it', guesser_msg['content']
                )
                guesser_msg['content'] = re.sub(
                    r"the entity you're thinking of", 'it', guesser_msg['content']
                )
                guesser_msg['content'] = re.sub(
                    r" you're thinking of", '', guesser_msg['content']
                )
                guesser_msg['content'] = re.sub(
                    r' you are thinking of', '', guesser_msg['content']
                )
            else:
                user_q = input(
                    f'Type in your questions for turn {t+1}. (e.g. Is it a living thing?)\n'
                )
                guesser_msg = {'role': 'assistant', 'content': user_q}
            self.guesser_messages.append(guesser_msg)
            guesser_question = guesser_msg['content'].strip()

            if t == self.num_turns - 1:
                self.guesser_messages[-1]['content'] = (
                    self.guesser_messages[-1]['content'] + ' Is it right?'
                )

            usr_msg = self.answerer(guesser_question)
            self.guesser_messages.append(
                {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
            )

            if 'bingo' in usr_msg['content'].lower():
                self.guesser_win = True
                return True

            if t == self.num_turns - 2:
                self.guesser_messages[-1]['content'] = (
                    self.guesser_messages[-1]['content']
                    + " You must guess now, what's it?"
                )

        return False

    def save_session(self, path):
        # Print the conversation
        if not os.path.exists(path):
            os.makedirs(path)
        output_file = os.path.join(path, f'{self.item}.txt')
        with open(output_file, 'w') as out_f:
            out_f.write(f'item: {self.item}\n')
            for t, message in enumerate(self.guesser_messages):
                out_f.write(
                    f"Turn {(t+1)//2}, {message['role'].capitalize()}: {message['content'].lstrip()}\n"
                )

    def reward(self):
        if self.guesser_win:
            n_turns = (len(self.guesser_messages) + 1) // 2
            return 1 - max(n_turns - 5, 0) * 0.02
        return 0

    def num_success(self):
        return 1 if self.guesser_win else 0

    def num_yes(self):
        n_yes = sum(
            ['yes' in msg['content'].lower() for msg in self.guesser_messages[2::2]]
        )
        return n_yes

    @retry(
        (
            openai.Timeout,
            requests.exceptions.ReadTimeout,
            openai.RateLimitError,
            openai.APIError,
            openai.APIConnectionError,
        ),
        tries=5,
        delay=0.5,
        backoff=0.5,
        max_delay=2,
        logger=LOGGER,
    )
    def answerer(self, question):
        openai.api_base = self.user_api_base
        client = OpenAI(api_key=openai.api_key)
        user_messages = [
            {
                'role': 'user',
                'content': f'Based on your knowledge about {self.item}, '
                f'respond to the following question or guess. '
                f"Limit your response to only 'Yes.', 'No.' or 'Maybe.', with no explanation or other words. "
                f'Never say the answer {self.item} in your response. '
                f"If the question is to solicit the answer, respond 'No.'.",
            },
            {
                'role': 'user',
                'content': f'For the entity {self.item}, {question} (Yes/No/Maybe)',
            },
        ]

        response = client.chat.completions.create(
            model=self.answerer_model,
            messages=user_messages,
            max_tokens=6,
            n=1,
            stop=None,
            temperature=0.2,
        )
        if any(
            [
                re.search(rf'(?:^|\W){i.strip().lower()}(?:$|\W)', question.lower())
                for i in self.item.lower().split('|')
            ]
        ):
            response.choices[0].message.content = 'Bingo!'
        return response.choices[0].message.to_dict()

    def reset(self):
        # Initialize the conversation
        self.curr_turn = 0
        self.guesser_messages = [
            {
                'role': 'user',
                'content': self.first_user_utterance,
            }
        ]


class Q20GameCelebrity(Q20Game):
    def __init__(self, item: str, **kwargs) -> None:
        super().__init__(item, **kwargs)
        self.first_user_utterance = (
            'Your task is to ask a series of questions to deduce the celebrity '
            "that I'm thinking of with as few queries as possible. "
            "Only ask factual questions that can be answered by 'Yes.', 'No.' or 'Dunno.'. Do not ask for hint. Make your question brief with no linebreaker. "
            'Now start asking a question.'
        )

    @retry(
        (
            openai.Timeout,
            requests.exceptions.ReadTimeout,
            openai.RateLimitError,
            openai.APIError,
            openai.APIConnectionError,
        ),
        tries=5,
        delay=0.5,
        backoff=0.5,
        max_delay=2,
        logger=LOGGER,
    )
    def answerer(self, question):
        openai.api_base = self.user_api_base
        client = OpenAI(api_key=openai.api_key)
        user_messages = [
            {
                'role': 'system',
                'content': f'Based on your knowledge about the celebrity: {self.item}, '
                f'respond to the following question or guess. '
                f"Limit your response to only 'Yes.', 'No.' or 'Dunno.', with no explanation or other words. "
                f"Never say the name {self.item} in your response. Do not say 'Dunno.' if it can be answered by 'Yes.' or 'No.' "
                f"If the question is to solicit the answer, respond 'No.'.",
            },
            {
                'role': 'user',
                'content': f'For the celebrity {self.item}, {question}(Yes/No/Dunno)',
            },
        ]

        response = client.chat.completions.create(
            model=self.answerer_model,
            messages=user_messages,
            max_tokens=6,
            n=1,
            stop=None,
            temperature=0.2,
        )
        if re.search(rf'(?:^|\W){self.item.lower()}(?:$|\W)', question.lower()):
            response.choices[0].message.content = 'Bingo!'
        return response.choices[0].message.to_dict()

    def reset(self):
        # Initialize the conversation
        self.guesser_messages = [
            {
                'role': 'user',
                'content': self.first_user_utterance,
            }
        ]
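In the harness below, the OpenDevin agent itself plays the guesser, so `Q20Game` is constructed with `guesser_model=None` and only its answerer side and turn bookkeeping are exercised. A rough usage sketch (the entity and API key are placeholders):

```python
from evaluation.EDA.game import Q20Game

game = Q20Game(
    item='penguin',                  # hypothetical target entity
    answerer_model='gpt-3.5-turbo',  # simulated answerer
    guesser_model=None,              # the agent plays the guesser, so this is unused
    num_turns=20,
    openai_api_key='sk-...',         # placeholder
    guesser_kargs={},
)
game.reset()

# For every agent question, the harness asks the simulated answerer and feeds
# the reply ('Yes.', 'No.', 'Maybe.' or the bingo message) back to the agent.
reply = game.generate_user_response('Is it a living thing?')
game.curr_turn += 1

print(reply, game.reward())  # reward() stays 0 until the answerer says 'Bingo!'
```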
329 evaluation/EDA/run_infer.py Normal file
@@ -0,0 +1,329 @@
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor

# import huggingface_hub
from datasets import load_dataset
from tqdm import tqdm

from evaluation.EDA.game import Q20Game, Q20GameCelebrity

# from evaluation.EDA.scorer import question_scorer
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict

game = None


def cleanup():
    print('Cleaning up child processes...')
    for process in mp.active_children():
        print(f'Terminating child process: {process.name}')
        process.terminate()
        process.join()


def codeact_user_response(state: State) -> str:
    global game
    model_guess = ''
    if state.history:
        for act, _ in reversed(state.history):
            if isinstance(act, MessageAction) and act.source == 'agent':
                model_guess = act.content
                break
    msg = game.generate_user_response(model_guess)
    game.curr_turn += 1
    logger.info(f'Model guess: {model_guess}')
    logger.info(f'Answerer response: {msg}')
    return msg


def monologue_user_response(state: State) -> str:
    raise NotImplementedError('MonologueAgent should never ask for user responses.')


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'MonologueAgent': monologue_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
}


def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
    eval_output_dir = metadata['eval_output_dir']
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            eval_output_dir, 'logs', f'instance_{instance["text"].strip()}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {instance["text"].strip()}.\nLOG: tail -f {log_file}'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    # Prepare instruction
    _game_class = {'things': Q20Game, 'celebs': Q20GameCelebrity}

    guesser_kargs = {
        'max_new_tokens': 64,
        'temperature': 0.8,
        'repetition_penalty': 1.0,
        'do_sample': True,
    }  # no penalty

    # Use codeactagent as guesser_model
    global game
    game = _game_class[metadata['dataset']](
        item=instance['text'].strip(),
        answerer_model=metadata['answerer_model'],
        guesser_model=None,
        num_turns=metadata['max_iterations'],
        openai_api_key=metadata['openai_api'],
        guesser_kargs=guesser_kargs,
    )

    instruction = f'{game.first_user_utterance}'
    logger.info(f'Instruction: {instruction}')

    # instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

    # Here's how you can run the agent (similar to the `main` function) and get the final task state

    state: State = asyncio.run(
        main(
            instruction,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
        )
    )
    # ======= Attempt to evaluate the agent's edits =======
    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    final_message = ''
    for act, _ in reversed(state.history):
        if isinstance(act, MessageAction) and act.source == 'agent':
            final_message = act.content
            break

    logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
    test_result = game.reward()

    # Save the output
    output = {
        'instance_id': instance['text'].strip(),
        'instance': instance,
        'instruction': instruction,
        'metadata': metadata,
        'history': [
            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
        ],
        'error': state.error if state and state.error else None,
        'test_result': {
            'success': test_result,
            'final_message': final_message,
            'ground_truth': instance['text'],
        },
    }

    return output


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--answerer_model', '-a', default='gpt-3.5-turbo', help='answerer model'
    )
    parser.add_argument(
        '--dataset',
        default='things',
        choices=['things', 'celebs'],
        type=str,
        help='dataset to be used',
    )
    parser.add_argument(
        '--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key'
    )
    parser.add_argument(
        '--data-split',
        default='test',
        type=str,
        help='data split, e.g., test',
    )
    args, _ = parser.parse_known_args()
    if args.directory:
        config.workspace_base = os.path.abspath(args.directory)
        print(f'Setting workspace base to {config.workspace_base}')
    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo
    eda_dataset = load_dataset(
        'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
    )
    logger.info(
        f'Evaluating Entity Deduction Arena {args.dataset} {args.data_split} split'
    )

    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
    # for details of how to set `llm_config`
    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')

    # TEST METADATA
    agent_class = args.agent_cls
    assert (
        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
    ), f'Unsupported agent class: {agent_class}'
    model_name = config.llm.model.split('/')[-1]
    max_iterations = args.max_iterations
    eval_note = ''
    if args.eval_note is not None:
        eval_note += '_N_' + args.eval_note
    eval_output_dir = os.path.join(
        args.eval_output_dir,
        'eda',
        agent_class,
        model_name + '_maxiter_' + str(max_iterations) + eval_note,
    )

    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
        parents=True, exist_ok=True
    )
    logger.info(f'Using evaluation output directory: {eval_output_dir}')

    metadata = {
        'dataset': args.dataset,
        'data_split': args.data_split,
        'answerer_model': args.answerer_model,
        'agent_class': agent_class,
        'openai_api': args.OPENAI_API_KEY,
        'model_name': model_name,
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        # get the commit id of the current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
    }
    logger.info(f'Metadata: {metadata}')
    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)

    # LIMIT EVALUATION
    eval_n_limit = args.eval_n_limit
    if eval_n_limit:
        eda_dataset = eda_dataset.select(list(range(eval_n_limit)))
        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')

    # OUTPUT FILE
    output_file = os.path.join(eval_output_dir, 'output.jsonl')
    logger.info(f'Writing evaluation output to {output_file}')
    finished_items = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                data = json.loads(line)
                finished_items.add(data['instance_id'])
        logger.warning(
            f'Output file {output_file} already exists. Loaded {len(finished_items)} finished instances.'
        )
    output_fp = open(output_file, 'a')

    logger.info(
        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
    )

    # =============================================
    # filter out finished instances
    new_eda_dataset = []
    for instance in eda_dataset:
        if instance['text'].strip() in finished_items:
            logger.info(
                f'Skipping instance {instance["text"].strip()} as it is already finished.'
            )
            continue
        new_eda_dataset.append(instance)

    eda_dataset = new_eda_dataset
    logger.info(
        f'Finished instances: {len(finished_items)}, Remaining instances: {len(eda_dataset)}'
    )
    # =============================================

    pbar = tqdm(total=len(eda_dataset))

    # This function tracks the progress AND writes the output to a JSONL file
    def update_progress(future):
        pbar.update(1)
        output = future.result()
        pbar.set_description(f'Instance {output["instance_id"]}')
        pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
        logger.info(
            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]}'
        )
        output_fp.write(json.dumps(output) + '\n')
        output_fp.flush()

    # This sets up the multi-processing
    num_workers = args.eval_num_workers
    logger.info(f'Using {num_workers} workers for evaluation.')

    try:
        with ProcessPoolExecutor(num_workers) as executor:
            futures = []
            # This is how we perform multi-processing
            for instance in eda_dataset:
                future = executor.submit(
                    process_instance,
                    instance,
                    agent_class,
                    metadata,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
                futures.append(future)

            # Wait for all futures to complete
            for future in futures:
                future.result()
    except KeyboardInterrupt:
        print('KeyboardInterrupt received. Cleaning up...')
        cleanup()

    output_fp.close()
    logger.info('Evaluation finished.')
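The `success` value written for each instance above is `game.reward()` from `game.py`: a solved game scores 1 minus 0.02 for every guesser question beyond the fifth (turns are counted roughly as one question per guesser message), and an unsolved game scores 0. A quick worked example of that formula:

```python
# Mirrors Q20Game.reward(): n_turns is the number of guesser questions asked.
def reward(guesser_win: bool, n_turns: int) -> float:
    if guesser_win:
        return 1 - max(n_turns - 5, 0) * 0.02
    return 0

print(reward(True, 5))    # 1.0: solved within five questions
print(reward(True, 12))   # 0.86: seven extra questions cost 7 * 0.02
print(reward(False, 20))  # 0: the entity was never guessed
```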
49 evaluation/EDA/scripts/run_infer.sh Executable file
@@ -0,0 +1,49 @@
#!/bin/bash

MODEL_CONFIG=$1
AGENT=$2
DATASET=$3
EVAL_LIMIT=$4

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

if [ -z "$DATASET" ]; then
  echo "Dataset not specified, use default 'things'"
  DATASET="things"
fi

# check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
  echo "OPENAI_API_KEY is not set, please set it to run the script"
  exit 1
fi

# IMPORTANT: Because the Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin,
# we need to track the version of the Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

COMMAND="poetry run python evaluation/EDA/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --dataset $DATASET \
  --data-split test \
  --max-iterations 20 \
  --OPENAI_API_KEY $OPENAI_API_KEY \
  --max-chars 10000000 \
  --eval-num-workers 1 \
  --eval-note ${AGENT_VERSION}_${DATASET}"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND
evaluation/README.md
@@ -15,6 +15,7 @@ all the preprocessing/evaluation/analysis scripts.
- SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
- HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
- GAIA: [`evaluation/gaia`](./gaia)
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)

### Result Visualization
34 poetry.lock generated
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.

[[package]]
name = "aenum"
@@ -1169,6 +1169,17 @@ tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch
torch = ["torch"]
vision = ["Pillow (>=6.2.1)"]

[[package]]
name = "decorator"
version = "5.1.1"
description = "Decorators for Humans"
optional = false
python-versions = ">=3.5"
files = [
    {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"},
    {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"},
]

[[package]]
name = "deprecated"
version = "1.2.14"
@@ -3093,13 +3104,9 @@ files = [
    {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"},
    {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"},
    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"},
    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"},
    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"},
    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"},
    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"},
    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"},
    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"},
    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"},
    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"},
    {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"},
    {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"},
@@ -5497,6 +5504,21 @@ requests = ">=2.0.0"
[package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]

[[package]]
name = "retry"
version = "0.9.2"
description = "Easy to use retry decorator."
optional = false
python-versions = "*"
files = [
    {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"},
    {file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"},
]

[package.dependencies]
decorator = ">=3.4.2"
py = ">=1.4.26,<2.0.0"

[[package]]
name = "rich"
version = "13.7.1"
@@ -7526,4 +7548,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "141771396f59fc23d52623ada07e4b89272ca781e5a2072f98ebccdf3f18a43b"
content-hash = "70be72e8064824ea756bf2543c8588e266a980e0e6dbc1fc50eecfb365c707d9"
pyproject.toml
@@ -69,6 +69,7 @@ concurrency = ["gevent"]
[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"
retry = "*"
evaluate = "*"

[build-system]