Fix issue #5222: [Refactor]: Refactor the evaluation directory (#5223)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
This commit is contained in:
OpenHands
2024-11-25 08:35:52 -05:00
committed by GitHub
parent 1725627c7d
commit 678436da30
152 changed files with 147 additions and 143 deletions

View File

@@ -0,0 +1,16 @@
from evaluation.benchmarks.mint.tasks.base import Task
from evaluation.benchmarks.mint.tasks.codegen import HumanEvalTask, MBPPTask
from evaluation.benchmarks.mint.tasks.reasoning import (
MultipleChoiceTask,
ReasoningTask,
TheoremqaTask,
)
# Public names of this package: every entry is one of the task classes
# imported above from the base, codegen and reasoning submodules.
__all__ = [
'Task',
'MultipleChoiceTask',
'ReasoningTask',
'TheoremqaTask',
'MBPPTask',
'HumanEvalTask',
]

View File

@@ -0,0 +1,90 @@
import json
import logging
import os
from abc import ABC, abstractmethod
from utils import load_file
LOGGER = logging.getLogger('MINT')
class Task(ABC):
"""Base class for a task instance."""
task_name: str = 'base'
in_context_example_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'in_context_examples',
)
def __init__(self, **kwargs) -> None:
if 'loaded_history' in kwargs:
self.loaded_history = kwargs['loaded_history']
else:
self.loaded_history = None
# pre-load the in-context example
task_dir = os.path.join(self.in_context_example_dir, self.task_name)
self._in_context_example = {
'with_tool': load_file(os.path.join(task_dir, 'with_tool.txt')),
}
self.metadata = {}
@property
def task_id(self) -> str:
"""Return the task id."""
assert hasattr(self, '_id'), 'Task does not have an id.'
return self._id
def in_context_example(
self, use_tool: bool = True, with_feedback: bool = False
) -> str:
"""Return the in-context example for the task."""
if use_tool and not with_feedback:
return self._in_context_example['with_tool']
else:
raise NotImplementedError
@property
def prompt(self) -> str:
"""Return the task prompt."""
assert hasattr(self, '_prompt'), 'Task does not have a prompt.'
return self._prompt
@property
def reference(self) -> str:
"""Return the reference solution for the task."""
assert hasattr(self, '_reference'), 'Task does not have a reference solution.'
return self._reference
@abstractmethod
def extract_answer(self, solution: str) -> str | None:
"""Extract the answer from the given solution."""
pass
@abstractmethod
def success(self, solution: str) -> bool:
"""This checks whether the given solution can complete the current task.
Can be used to provide binary feedback.
"""
answer = self.extract_answer(solution)
return answer == self.reference
@classmethod
def load_tasks(cls, path: str) -> tuple[list['Task'], int]:
"""Load all the tasks from a given jsonl file."""
assert path.endswith('.jsonl') or path.endswith('.json')
with open(path, 'r') as f:
tasks = [cls(**json.loads(line)) for line in f.readlines()]
LOGGER.info(f'Loaded {len(tasks)} tasks from {path}')
return tasks, len(tasks)
def to_dict(self) -> dict:
"""Convert the task to a dictionary."""
return {
'task_name': self.task_name,
'task_id': self.task_id,
'prompt': self.prompt,
'reference': self.reference,
'metadata': self.metadata,
}

View File

@@ -0,0 +1,81 @@
import logging
from utils import check_correctness
from evaluation.benchmarks.mint.tasks.base import Task
LOGGER = logging.getLogger('MINT')
class CodeGenTask(Task):
    """Generic code generation task instance."""

    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
        """Record the task id, its prompt and the reference test code."""
        super().__init__(**kwargs)
        self._id, self._prompt, self._reference = id, prompt, reference

    def success(self, solution: str) -> bool:
        """Run the extracted code against the reference tests.

        Can be used to provide binary feedback: the return value is the
        ``'success'`` entry of the ``check_correctness`` result.
        """
        candidate_code = self.extract_answer(solution)
        test_code = self._reference
        LOGGER.debug(f'CODE_TO_EXEC:\n{candidate_code}')
        LOGGER.debug(f'TEST_CODE:\n{test_code}')
        outcome = check_correctness(
            solution_code=candidate_code,
            test_code=test_code,
            timeout=10,
        )
        return outcome['success']
class MBPPTask(CodeGenTask):
    """Code generation task for MBPP problems."""

    task_name = 'mbpp'

    @property
    def prompt(self) -> str:
        """Return the prompt for this task.

        The stored MBPP prompt is wrapped in triple double-quotes; they are
        removed and the surrounding whitespace is trimmed.
        """
        without_quotes = self._prompt.replace('"""', '')
        return without_quotes.strip()

    def extract_answer(self, solution: str) -> str | None:
        """Extract the answer from the given solution.

        The solution is returned unchanged. The stop-word based truncation of
        the upstream harness is intentionally not applied, see
        https://github.com/bigcode-project/bigcode-evaluation-harness/blob/d61afde130005ecc65cf800ad8eca790a9bc2115/lm_eval/tasks/mbpp.py#L67
        """
        # Upstream stop words, kept for reference only:
        #   "\nclass", "\nassert", '\n"""', "\nprint", "\nif", "\n<|/"
        return solution
class HumanEvalTask(CodeGenTask):
    """Code generation task for HumanEval problems."""

    task_name = 'humaneval'

    @property
    def prompt(self) -> str:
        """Return the prompt for this task.

        The stored prompt is prefixed with an instruction line asking for the
        code to be completed.
        """
        instruction = 'Complete the following code:\n\n'
        return instruction + self._prompt

    def extract_answer(self, solution: str) -> str | None:
        """Extract the answer from the given solution.

        The solution is returned unchanged. The stop-word based truncation of
        the upstream harness is intentionally not applied, see
        https://github.com/bigcode-project/bigcode-evaluation-harness/blob/d61afde130005ecc65cf800ad8eca790a9bc2115/lm_eval/tasks/humaneval.py#L56
        """
        # Upstream stop words, kept for reference only:
        #   "\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"
        return solution

View File

@@ -0,0 +1,51 @@
Task:
Complete the following code:
from typing import Tuple
def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
"""
Write a function to find the similar elements from the given two tuple lists.
assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
"""
Assistant:
<thought> The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. </thought>
<execute>
from typing import Tuple
def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
res = tuple(set(test_tup1) | set(test_tup2))
return res
res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
</execute>
Observation:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Cell In[4], line 5
3 return res
4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10)
Assistant:
<thought> To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. </thought>
<execute>
def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
res = tuple(set(test_tup1) & set(test_tup2))
return res
res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
</execute>
Observation:
[Executed Successfully with No Output]
Assistant:
<thought> There is no more AssertionError. I can now submit the solution. </thought>
<solution>
def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
res = tuple(set(test_tup1) & set(test_tup2))
return res
</solution>

View File

@@ -0,0 +1,44 @@
Task:
Write a function to find the similar elements from the given two tuple lists.
assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
Assistant:
<thought> The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. </thought>
<execute>
def similar_elements(test_tup1, test_tup2):
res = tuple(set(test_tup1) | set(test_tup2))
return res
res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
</execute>
Observation:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Cell In[4], line 5
3 return res
4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10)
Assistant:
<thought> To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. </thought>
<execute>
def similar_elements(test_tup1, test_tup2):
res = tuple(set(test_tup1) & set(test_tup2))
return res
res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
</execute>
Observation:
[Executed Successfully with No Output]
Assistant:
<thought> There is no more AssertionError. I can now submit the solution. </thought>
<solution>
def similar_elements(test_tup1, test_tup2):
res = tuple(set(test_tup1) & set(test_tup2))
return res
</solution>

View File

@@ -0,0 +1,48 @@
Task:
Evaluate $\int\int_C (x+y)^2 dx,dy$, where C is the circle defined by $x^2 + y^2 = M$, and M is the birth year of Li Bai.
Output format requirement: Round to three decimal places, using half up rounding rule.
Assistant:
<thought> To solve this problem, let's start by determining Li Bai's birth year. </thought>
<execute_ipython> print(wikipedia_search('Li Bai')) </execute_ipython>
Observation:
Page: Li Bai
Summary: Li Bai (Chinese: 李白; pinyin: Lǐ Bái, 701–762), also pronounced as Li Bo, courtesy name Taibai (Chinese: 太白), was a Chinese poet, acclaimed from his own time to the present as a brilliant and romantic figure who took traditional poetic forms to new heights.
Assistant:
<thought>
From the wiki_search result, we can see that Li Bai was born in 701, thus M = 701.
To evaluate the double integral of $(x+y)^2$ over the region $C$, where $C$ is the circle defined by $x^2 + y^2 = 701$, we should use polar coordinates.
In polar coordinates, $x = r\cos(\theta)$ and $y = r\sin(\theta)$, and the differential area element $dx,dy$ becomes $r,dr,d\theta$.
$\int\int_C (x+y)^2 dx,dy = \int_0^{2\pi}\int_0^{\sqrt{701}} (r\cos(\theta) + r\sin(\theta))^2r,dr,d\theta$
Let's use the SymPy package to calculate this.
</thought>
<execute_ipython>
from sympy import symbols, integrate, cos, sin, pi
r, theta = symbols('r theta')
# the integrand
integrand = (r*cos(theta) + r*sin(theta))**2 * r
# perform the double integration
double_integral = integrate(integrate(integrand, (r, 0, 701 ** 0.5)), (theta, 0, 2*pi))
print(double_integral)
</execute_ipython>
Observation:
Out[2]: 245700.5*pi
Assistant:
<thought> Let's round this result to three decimal places using half up rounding rule. </thought>
<execute_ipython>
import math
print(round(245700.5*math.pi, 3))
</execute_ipython>
Observation:
771890.886
Assistant:
The answer is <solution> 771890.886 </solution>.

View File

@@ -0,0 +1,355 @@
import ast
import logging
import re
import traceback
from typing import Any
import numpy as np
from sympy import Rational
from tasks.base import Task
LOGGER = logging.getLogger('MINT')
class ReasoningTask(Task):
    """Free-form reasoning task graded by numeric closeness or substring match."""

    task_name = 'reasoning'

    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
        """Store the id, the trimmed prompt and the normalized reference."""
        super().__init__(**kwargs)
        self._id = id
        self._prompt = prompt.strip()
        # The reference is compared as lower-cased, trimmed text.
        reference_text = str(reference)
        self._reference = reference_text.strip().lower()

    def extract_answer(self, solution: str) -> str | None:
        """Return the solution lower-cased and stripped of outer whitespace."""
        lowered = solution.lower()
        return lowered.strip()

    def compare_w_digits(self, reference: str, answer: str) -> bool:
        """Compare the reference and the answer, numerically when possible.

        When both values parse as floats the answer must lie within 5% of the
        reference. When parsing raises ``ValueError`` the check falls back to
        "reference is contained in answer". Any other conversion failure is
        reported as ``ValueError``.
        """
        try:
            reference_value = float(reference)
            answer_value = float(answer)
        except ValueError:
            return reference in answer
        except Exception:
            raise ValueError(f'Cannot compare {reference} and {answer}')
        tolerance = 0.05 * abs(reference_value)
        return abs(reference_value - answer_value) <= tolerance

    def success(self, solution: str) -> bool:
        """Return whether the extracted answer matches the reference."""
        return self.compare_w_digits(self._reference, self.extract_answer(solution))
class MultipleChoiceTask(Task):
    """Subclass of Task for multiple choice tasks."""

    task_name = 'reasoning'

    def __init__(self, id, prompt: str, reference: str, **kwargs):
        """Store the prompt, the reference letter and the parsed options.

        Args:
            id: Task identifier.
            prompt: Question text; the options are parsed from the part after
                ``'Options: '``.
            reference: Letter of the correct option (stored lower-cased).
            **kwargs: ``hide_options`` (default ``False``) removes the options
                from the prompt; the rest is forwarded to ``Task``.
        """
        super().__init__(**kwargs)
        self._id = id
        self.hide_options = kwargs.get('hide_options', False)
        if self.hide_options:
            self._prompt = prompt.split('Options:')[0].strip()
        else:
            self._prompt = prompt
        self._reference = reference.strip().lower()
        self._options = self.extract_options(prompt)
        # if all options can be converted to float, strictly perform hide options
        try:
            for option in self._options.values():
                float(option)
            self.hide_options = True
        except ValueError:
            pass
        self.metadata.update({'options': self._options})

    def extract_answer(self, solution: str) -> str | None:
        """Return the selected option letter, or the normalized solution text.

        The first letter (in alphabetical order) that occurs as ``'x)'`` or
        ``'x )'`` in the lower-cased solution is returned; without such a
        marker the lower-cased, stripped solution itself is returned.
        """
        solution = solution.lower().strip()
        for letter in 'abcdefghijklmnopqrstuvwxyz':
            if f'{letter})' in solution or f'{letter} )' in solution:
                print('SOLUTION', letter)
                return letter
        print('SOLUTION', solution)
        return solution

    def compare_w_digits(self, reference: str, answer: str) -> bool:
        """Compare two strings: within 5% if both are digits, else by containment."""
        if reference.isdigit() and answer.isdigit():
            return abs(float(reference) - float(answer)) <= 0.05 * float(reference)
        return reference in answer

    def success(self, solution: str) -> bool:
        """Return whether the solution selects the correct option.

        The answer is accepted when it matches the reference letter. Otherwise
        it is compared with the option texts: mentioning any wrong option
        fails, mentioning the correct option succeeds.
        """
        answer = self.extract_answer(solution)
        if self.compare_w_digits(self._reference, answer):
            return True

        correct_option = self._options[self._reference]
        all_options = list(self._options.values())
        print('OPTIONS', correct_option, all_options)
        print('ANSWER', answer)
        # Options contained in the correct option (including the correct option
        # itself) must not count as wrong answers. Build a new list: removing
        # items from the list being iterated skips the element that follows
        # each removal.
        wrong_options = [
            option for option in all_options if option not in correct_option
        ]
        for option in wrong_options:
            if self.compare_w_digits(option, answer) or (option in answer):
                return False
        return self.compare_w_digits(correct_option, answer) or (
            correct_option in answer
        )

    def extract_options(self, prompt: str) -> dict:
        """Parse the options of a prompt into a ``{letter: content}`` dict.

        The text after the last ``'Options: '`` is split on ``' , '``; every
        fragment is expected to look like ``'A) some text'``. Letters and
        contents are lower-cased, and a trailing "which option/one is correct?"
        question is removed from the content.
        """
        option_text = prompt.split('Options: ')[-1]
        options = {}
        for fragment in option_text.split(' , '):
            parts = fragment.strip("[]' ").split(')')
            letter = parts[0].lower().strip()
            # The text is lower-cased first, so the trailing question has to be
            # matched in lower case as well; the capitalised literals used
            # previously could never match.
            content = (
                parts[1]
                .lower()
                .strip('.')
                .replace('. which option is correct?', '')
                .replace('. which one is correct?', '')
                .strip()
            )
            options[letter] = content
        return options
# ==== TheoremQA ====
def compare_two_numbers(p, gt):
    """Compare a numeric prediction ``p`` with the ground truth ``gt``.

    Non-numeric predictions of a known builtin type yield ``False``; any other
    type raises ``ValueError``. A float ground truth is compared through
    ``within_eps``, everything else against the rounded prediction.
    """
    if not isinstance(p, (int, float)):
        if isinstance(p, (bool, complex, dict, list, str, tuple)):
            return False
        raise ValueError(p)

    if isinstance(gt, float):
        return within_eps(pred=p, gt=gt)
    return round(p) == gt
def compare_two_list(pred, gt):
    """Compare a predicted list of numbers with the ground-truth list.

    The prediction must be a list of ints/floats with the same length as
    ``gt``. Both lists are sorted and then compared element by element with
    ``compare_two_numbers``.
    """
    if not isinstance(pred, list):
        return False
    if len(pred) != len(gt):
        return False
    if not all(isinstance(item, (int, float)) for item in pred):
        return False

    sorted_pred = sorted(pred)
    sorted_gt = sorted(gt)
    # Evaluate every pair before reducing, exactly like the list-based form.
    results = [compare_two_numbers(p, g) for p, g in zip(sorted_pred, sorted_gt)]
    return all(results)
def within_eps(pred: float, gt: float):
    """Return whether ``pred`` lies within 4% of ``gt`` (bounds inclusive)."""
    margin = abs(gt) * 0.04
    lower_bound = gt - margin
    upper_bound = gt + margin
    return bool(pred >= lower_bound and pred <= upper_bound)
def parse_number_list(s: str):
    """Parse ``s`` as a Python literal with ``ast.literal_eval`` and return it."""
    return ast.literal_eval(s)
def is_number(string):
    """Return whether ``string`` is a plain decimal number.

    An optional sign, either comma-grouped thousands or bare digits, and an
    optional fractional part are accepted.
    """
    number_re = re.compile(r'^[-+]?(\d{1,3}(,\d{3})*|(\d+))(\.\d+)?$')
    return number_re.match(string) is not None
def is_scientific_number(string):
    """Return whether ``string`` is a number in lower-case ``e`` notation.

    The exponent may carry a minus sign but not a plus sign.
    """
    scientific_re = re.compile(r'^[-+]?\d+(\.\d+)?e[-]?\d+$')
    return scientific_re.match(string) is not None
def contain_num_and_str(string):
    """Return whether ``string`` has both an ASCII letter and a digit."""
    has_letter = re.search(r'[a-zA-Z]', string) is not None
    if not has_letter:
        return False
    return re.search(r'[0-9]', string) is not None
class TheoremqaTask(Task):
    """TheoremQA task: the answer is a number, a list of numbers, a bool or an option.

    Answer parsing and grading follow the TheoremQA reference implementation
    (``predict_accuracy.py``) linked in the methods below.
    """

    task_name = 'reasoning'

    # Patterns that isolate a bare number from a short unit or prefix around it.
    # They are tried in this order with ``re.match``; group 1 is the number.
    _UNIT_PATTERNS = (
        r'([-+]?(?:[\d,]*\.*\d+)) [^0-9 ]+$',
        r'[^0-9 ]+ ([-+]?(?:[\d,]*\.*\d+))$',
        r'([-+]?(?:[\d,]*\.*\d+))[^\d]{1,2}$',
        r'[^-+\d]{1,2}((?:[\d,]*\.*\d+))$',
    )

    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
        """Store the id, the instruction-prefixed prompt and the reference.

        Args:
            id: Task identifier.
            prompt: Question text.
            reference: Reference answer; parsed with ``extract_answer`` when
                grading.
            **kwargs: ``answer_type`` selects the comparison used by
                ``success``; the rest is forwarded to ``Task``.
        """
        super().__init__(**kwargs)
        self._id = id
        self._prompt = (
            'Answer the following question with a number, a list of numbers or True or False. '
            + prompt.strip()
        )
        self._reference = reference
        self._answer_type = kwargs.get('answer_type')

    def extract_answer(self, solution: str) -> Any:
        """Extract the answer from the given solution.

        The text is normalized (special tokens, booleans, units, powers of
        ten, option letters) and then evaluated as a Python expression.

        Returns:
            The evaluated value (number, list, bool or option string), or
            ``None`` when the normalized text cannot be evaluated.
        """
        # Local import: the power-of-ten rewrite below generates ``math.pow``
        # calls, so ``math`` has to be visible to ``eval``.
        import math

        prediction = solution
        # Following the preprocessing steps from TheoremQA
        # https://github.com/wenhuchen/TheoremQA/blob/123e36beaaa97c01f28a582f13c4f77a6822c199/predict_accuracy.py#L170

        # Preprocessing the string [Stage 1]
        if not isinstance(prediction, str):
            prediction = str(prediction) if prediction is not None else '0'

        # Replace special tokens: keep only what follows the last '=' / '≈'.
        if '=' in prediction:
            prediction = prediction.split('=')[-1].strip()
        if '≈' in prediction:
            prediction = prediction.split('≈')[-1].strip()
        for token in ('`', '$', '°'):
            prediction = prediction.replace(token, '')

        # Detect the boolean keyword in the generation
        if prediction in ('true', 'yes', 'false', 'no'):
            prediction = 'True' if prediction in ('true', 'yes') else 'False'
        if 'True' in prediction or 'False' in prediction:
            prediction = 'True' if 'True' in prediction else 'False'

        # Detect the approximation keyword
        if 'approximately' in prediction:
            prediction = prediction.replace('approximately', '').strip()
        if ' or ' in prediction:
            prediction = prediction.split(' or ')[0]

        # Drop the units before and after the number
        for pattern in self._UNIT_PATTERNS:
            unit_match = re.match(pattern, prediction)
            if unit_match:
                prediction = unit_match.group(1)

        # Preprocessing the number [Stage 2]
        if '10^' in prediction:
            prediction = re.sub(r'10\^(-?\d+)', r'math.pow(10, \1)', prediction)
        prediction = prediction.replace(' x ', '*').replace(' × ', '*')
        if is_number(prediction):
            prediction = prediction.replace(',', '')

        # Preprocessing the option [Stage 3]
        for letter in 'abcd':
            if (
                f'{letter})' in prediction
                or f'{letter} )' in prediction
                or prediction.lower().strip() == letter
            ):
                prediction = f'({letter})'
        option_match = re.search(r'\([a-d]\)', prediction)
        if option_match:
            # Quote the option so that it evaluates to a string below.
            prediction = '"' + option_match.group(0) + '"'

        # If the prediction is empty, use dummy '0'
        if not prediction:
            prediction = '0'

        # Converting the string answer to a number/list/bool/option
        # NOTE(security): ``eval`` executes model-generated text; only run this
        # on output you are prepared to execute.
        try:
            prediction = eval(prediction, dict(globals(), math=math))
        except Exception:
            LOGGER.warning(
                f'[TASK] Failed to convert the answer: {prediction}\n{traceback.format_exc()}'
            )
            return None  # failed to convert the answer

        # Performing common type conversion
        if isinstance(prediction, (set, tuple)):
            prediction = list(prediction)
            # Guard against empty containers before looking at the first item.
            if prediction and isinstance(prediction[0], complex):
                prediction = [tmp.real for tmp in prediction]
            elif prediction and isinstance(prediction[0], Rational):
                prediction = [float(tmp) for tmp in prediction]
        elif isinstance(prediction, np.ndarray):
            prediction = prediction.tolist()
        elif isinstance(prediction, complex):
            prediction = prediction.real
        elif isinstance(prediction, Rational):
            prediction = float(prediction)

        return prediction

    def success(self, solution: str) -> bool:
        """Check whether the given solution completes the current task.

        The parsed prediction is compared with the parsed reference according
        to ``answer_type``. Predictions of an unsupported type and unknown
        answer types count as failures.
        """
        # Follow the implementation from TheoremQA
        # https://github.com/wenhuchen/TheoremQA/blob/123e36beaaa97c01f28a582f13c4f77a6822c199/predict_accuracy.py#L301C9-L317C1
        prediction = self.extract_answer(solution)
        LOGGER.info(f'TheoremQA Parsed Prediction: {prediction}')
        answer_type = self._answer_type
        gt = self.extract_answer(self.reference)

        if not isinstance(prediction, (str, int, float, list)):
            return False

        # Comparing prediction against the reference
        if answer_type in ['bool', 'option', 'Option']:
            return bool(prediction == f'({gt})') or bool(prediction == gt)
        if answer_type in ['integer', 'float']:
            return bool(compare_two_numbers(prediction, gt))
        if answer_type in ['list of integer', 'list of float']:
            return bool(compare_two_list(prediction, gt))
        return False