Fix issue #5222: [Refactor]: Refactor the evaluation directory (#5223)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2026-01-09 14:57:59 -05:00 · 2024-11-25 08:35:52 -05:00
parent 1725627c7d
commit 678436da30
152 changed files with 147 additions and 143 deletions
--- a/evaluation/benchmarks/mint/tasks/init.py
+++ b/evaluation/benchmarks/mint/tasks/init.py
@@ -0,0 +1,16 @@
+from evaluation.benchmarks.mint.tasks.base import Task
+from evaluation.benchmarks.mint.tasks.codegen import HumanEvalTask, MBPPTask
+from evaluation.benchmarks.mint.tasks.reasoning import (
+    MultipleChoiceTask,
+    ReasoningTask,
+    TheoremqaTask,
+)
+
+# Public names of this package: the task classes imported above, re-exported
+# so that `from ... import *` exposes exactly these.
+__all__ = [
+    'Task',
+    'MultipleChoiceTask',
+    'ReasoningTask',
+    'TheoremqaTask',
+    'MBPPTask',
+    'HumanEvalTask',
+]
--- a/evaluation/benchmarks/mint/tasks/base.py
+++ b/evaluation/benchmarks/mint/tasks/base.py
@@ -0,0 +1,90 @@
+import json
+import logging
+import os
+from abc import ABC, abstractmethod
+
+from utils import load_file
+
+# Module-level logger, registered under the name 'MINT'.
+LOGGER = logging.getLogger('MINT')
+
+
+class Task(ABC):
+    """Base class for a task instance.
+
+    Subclasses pick a ``task_name`` (it selects the in-context example
+    directory) and are expected to set ``_id``, ``_prompt`` and ``_reference``,
+    which back the read-only properties defined here.
+    """
+
+    task_name: str = 'base'
+    # Directory holding one sub-directory of in-context examples per task name.
+    in_context_example_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'in_context_examples',
+    )
+
+    def __init__(self, **kwargs) -> None:
+        """Store the optional ``loaded_history`` and pre-load the example text.
+
+        Only the ``loaded_history`` keyword is read here; it defaults to
+        ``None`` when absent.
+        """
+        self.loaded_history = kwargs.get('loaded_history')
+        # pre-load the in-context example
+        task_dir = os.path.join(self.in_context_example_dir, self.task_name)
+        self._in_context_example = {
+            'with_tool': load_file(os.path.join(task_dir, 'with_tool.txt')),
+        }
+        self.metadata = {}
+
+    @property
+    def task_id(self) -> str:
+        """Return the task id."""
+        assert hasattr(self, '_id'), 'Task does not have an id.'
+        return self._id
+
+    def in_context_example(
+        self, use_tool: bool = True, with_feedback: bool = False
+    ) -> str:
+        """Return the in-context example for the task.
+
+        Only the tool-enabled, no-feedback variant is available; any other
+        combination raises ``NotImplementedError``.
+        """
+        if use_tool and not with_feedback:
+            return self._in_context_example['with_tool']
+        raise NotImplementedError
+
+    @property
+    def prompt(self) -> str:
+        """Return the task prompt."""
+        assert hasattr(self, '_prompt'), 'Task does not have a prompt.'
+        return self._prompt
+
+    @property
+    def reference(self) -> str:
+        """Return the reference solution for the task."""
+        assert hasattr(self, '_reference'), 'Task does not have a reference solution.'
+        return self._reference
+
+    @abstractmethod
+    def extract_answer(self, solution: str) -> str | None:
+        """Extract the answer from the given solution."""
+        pass
+
+    @abstractmethod
+    def success(self, solution: str) -> bool:
+        """This checks whether the given solution can complete the current task.
+
+        Can be used to provide binary feedback. Although abstract, this default
+        (exact equality between the extracted answer and the reference) can be
+        reused by subclasses through ``super().success(solution)``.
+        """
+        answer = self.extract_answer(solution)
+        return answer == self.reference
+
+    @classmethod
+    def load_tasks(cls, path: str) -> tuple[list['Task'], int]:
+        """Load all the tasks from a given jsonl file.
+
+        Every non-blank line is parsed as one JSON object whose keys are passed
+        as keyword arguments to the class constructor. Blank lines (for
+        example a trailing empty line) are skipped instead of failing the JSON
+        parse.
+
+        Returns:
+            The list of loaded tasks and its length.
+        """
+        assert path.endswith(('.jsonl', '.json'))
+        # JSON text is UTF-8; do not depend on the platform default encoding.
+        with open(path, 'r', encoding='utf-8') as f:
+            tasks = [cls(**json.loads(line)) for line in f if line.strip()]
+        LOGGER.info(f'Loaded {len(tasks)} tasks from {path}')
+        return tasks, len(tasks)
+
+    def to_dict(self) -> dict:
+        """Convert the task to a dictionary."""
+        return {
+            'task_name': self.task_name,
+            'task_id': self.task_id,
+            'prompt': self.prompt,
+            'reference': self.reference,
+            'metadata': self.metadata,
+        }
--- a/evaluation/benchmarks/mint/tasks/codegen.py
+++ b/evaluation/benchmarks/mint/tasks/codegen.py
@@ -0,0 +1,81 @@
+import logging
+
+from utils import check_correctness
+
+from evaluation.benchmarks.mint.tasks.base import Task
+
+# Module-level logger, registered under the name 'MINT'.
+LOGGER = logging.getLogger('MINT')
+
+
+class CodeGenTask(Task):
+    """Generic code generation task instance.
+
+    The reference is used as test code: a solution succeeds when
+    ``check_correctness`` reports success for the extracted code run against it.
+    """
+
+    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
+        """Record the task id, prompt and reference test code."""
+        super().__init__(**kwargs)
+        self._id, self._prompt, self._reference = id, prompt, reference
+
+    def success(self, solution: str) -> bool:
+        """Check whether the given solution completes the current task.
+
+        Can be used to provide binary feedback. The extracted code and the
+        reference test code are handed to ``check_correctness`` with a timeout
+        of 10, and the ``'success'`` entry of its result is returned.
+        """
+        candidate = self.extract_answer(solution)
+        LOGGER.debug(f'CODE_TO_EXEC:\n{candidate}')
+        LOGGER.debug(f'TEST_CODE:\n{self._reference}')
+        outcome = check_correctness(
+            solution_code=candidate,
+            test_code=self._reference,
+            timeout=10,
+        )
+        return outcome['success']
+
+
+class MBPPTask(CodeGenTask):
+    """Code generation task using the 'mbpp' in-context examples."""
+
+    task_name = 'mbpp'
+
+    @property
+    def prompt(self) -> str:
+        """Return the prompt for this task.
+
+        The stored prompt is wrapped in triple double-quotes; every such
+        marker is removed and surrounding whitespace is trimmed.
+        """
+        without_quotes = self._prompt.replace('"""', '')
+        return without_quotes.strip()
+
+    def extract_answer(self, solution: str) -> str | None:
+        """Extract the answer from the given solution.
+
+        The solution is returned unchanged. The stop-word based truncation of
+        the upstream harness is intentionally not applied here; see
+        https://github.com/bigcode-project/bigcode-evaluation-harness/blob/d61afde130005ecc65cf800ad8eca790a9bc2115/lm_eval/tasks/mbpp.py#L67
+        """
+        # Upstream splits on these stop words and keeps the first chunk:
+        # STOP_WORDS = ["\nclass", "\nassert", '\n"""', "\nprint", "\nif", "\n<|/"]
+        # return re.split("|".join(STOP_WORDS), solution)[0].rstrip()
+        return solution
+
+
+class HumanEvalTask(CodeGenTask):
+    """Code generation task using the 'humaneval' in-context examples."""
+
+    task_name = 'humaneval'
+
+    @property
+    def prompt(self) -> str:
+        """Return the prompt for this task.
+
+        The stored prompt is prefixed with a "Complete the following code:"
+        instruction followed by a blank line.
+        """
+        instruction = 'Complete the following code:\n\n'
+        return instruction + self._prompt
+
+    def extract_answer(self, solution: str) -> str | None:
+        """Extract the answer from the given solution.
+
+        The solution is returned unchanged. The stop-word based trimming of
+        the upstream harness is intentionally not applied here; see
+        https://github.com/bigcode-project/bigcode-evaluation-harness/blob/d61afde130005ecc65cf800ad8eca790a9bc2115/lm_eval/tasks/humaneval.py#L56
+        """
+        # Upstream drops the last block that starts with one of these stop words:
+        # STOP_WORDS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"]
+        # string_list = re.split("(%s)" % "|".join(STOP_WORDS), solution)
+        # # last string should be ""
+        # return "".join(string_list[:-2])
+        return solution
--- a/evaluation/benchmarks/mint/tasks/in_context_examples/humaneval/with_tool.txt
+++ b/evaluation/benchmarks/mint/tasks/in_context_examples/humaneval/with_tool.txt
@@ -0,0 +1,51 @@
+Task:
+Complete the following code:
+
+from typing import Tuple
+def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
+    """
+    Write a function to find the similar elements from the given two tuple lists.
+    assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
+    """
+
+Assistant:
+<thought> The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. </thought>
+<execute>
+from typing import Tuple
+def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
+    res = tuple(set(test_tup1) | set(test_tup2))
+    return res
+res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+</execute>
+
+Observation:
+---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+Cell In[4], line 5
+      3     return res
+      4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+
+AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10)
+
+Assistant:
+<thought> To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. </thought>
+<execute>
+def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
+    res = tuple(set(test_tup1) & set(test_tup2))
+    return res
+res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+</execute>
+
+Observation:
+[Executed Successfully with No Output]
+
+Assistant:
+<thought> There is no more AssertionError. I can now submit the solution. </thought>
+<solution>
+def similar_elements(test_tup1: Tuple[int], test_tup2: Tuple[int]) -> Tuple[int]:
+    res = tuple(set(test_tup1) & set(test_tup2))
+    return res
+</solution>
--- a/evaluation/benchmarks/mint/tasks/in_context_examples/mbpp/with_tool.txt
+++ b/evaluation/benchmarks/mint/tasks/in_context_examples/mbpp/with_tool.txt
@@ -0,0 +1,44 @@
+Task:
+Write a function to find the similar elements from the given two tuple lists.
+assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
+
+Assistant:
+<thought> The similar elements here are just elements that are in both lists. I should run a solution on the test case before proposing a solution. </thought>
+<execute>
+def similar_elements(test_tup1, test_tup2):
+    res = tuple(set(test_tup1) | set(test_tup2))
+    return res
+res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+</execute>
+
+Observation:
+---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+Cell In[4], line 5
+      3     return res
+      4 res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+----> 5 assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+
+AssertionError: Expected (4, 5) but got (3, 4, 5, 6, 7, 10)
+
+Assistant:
+<thought> To find the similar elements, I should find the intersection of the two sets. I should use the & operator instead of the | operator. </thought>
+<execute>
+def similar_elements(test_tup1, test_tup2):
+    res = tuple(set(test_tup1) & set(test_tup2))
+    return res
+res = similar_elements((3, 4, 5, 6), (5, 7, 4, 10))
+assert res == (4, 5), "Expected (4, 5) but got {}".format(res)
+</execute>
+
+Observation:
+[Executed Successfully with No Output]
+
+Assistant:
+<thought> There is no more AssertionError. I can now submit the solution. </thought>
+<solution>
+def similar_elements(test_tup1, test_tup2):
+    res = tuple(set(test_tup1) & set(test_tup2))
+    return res
+</solution>
--- a/evaluation/benchmarks/mint/tasks/in_context_examples/reasoning/with_tool.txt
+++ b/evaluation/benchmarks/mint/tasks/in_context_examples/reasoning/with_tool.txt
@@ -0,0 +1,48 @@
+Task:
+Evaluate $\int\int_C (x+y)^2 dx,dy$, where C is the circle defined by $x^2 + y^2 = M$, and M is the birth year of Li Bai.
+Output format requirement: Round to three decimal places, using half up rounding rule.
+
+Assistant:
+<thought> To solve this problem, let's start by determining Li Bai's birth year. </thought>
+<execute_ipython> print(wikipedia_search('Li Bai')) </execute_ipython>
+
+Observation:
+Page: Li Bai
+Summary: Li Bai (Chinese: 李白; pinyin: Lǐ Bái, 701–762), also pronounced as Li Bo, courtesy name Taibai (Chinese: 太白), was a Chinese poet, acclaimed from his own time to the present as a brilliant and romantic figure who took traditional poetic forms to new heights.
+
+Assistant:
+<thought>
+From the wiki_search result, we can see that Li Bai was born in 701, thus M = 701.
+To evaluate the double integral of $(x+y)^2$ over the region $C$, where $C$ is the circle defined by $x^2 + y^2 = 701$, we should use polar coordinates.
+In polar coordinates, $x = r\cos(\theta)$ and $y = r\sin(\theta)$, and the differential area element $dx,dy$ becomes $r,dr,d\theta$.
+$\int\int_C (x+y)^2 dx,dy = \int_0^{2\pi}\int_0^{\sqrt{701}} (r\cos(\theta) + r\sin(\theta))^2r,dr,d\theta$
+Let's use the SymPy package to calculate this.
+</thought>
+<execute_ipython>
+from sympy import symbols, integrate, cos, sin, pi
+
+r, theta = symbols('r theta')
+
+# the integrand
+integrand = (r*cos(theta) + r*sin(theta))**2 * r
+
+# perform the double integration
+double_integral = integrate(integrate(integrand, (r, 0, 701 ** 0.5)), (theta, 0, 2*pi))
+print(double_integral)
+</execute_ipython>
+
+Observation:
+Out[2]: 245700.5*pi
+
+Assistant:
+<thought> Let's round this result to three decimal places using half up rounding rule. </thought>
+<execute_ipython>
+import math
+print(round(245700.5*math.pi, 3))
+</execute_ipython>
+
+Observation:
+771890.886
+
+Assistant:
+The answer is <solution> 771890.886 </solution>.
--- a/evaluation/benchmarks/mint/tasks/reasoning.py
+++ b/evaluation/benchmarks/mint/tasks/reasoning.py
@@ -0,0 +1,355 @@
+import ast
+import logging
+import re
+import traceback
+from typing import Any
+
+import numpy as np
+from sympy import Rational
+
+from tasks.base import Task
+
+# Module-level logger, registered under the name 'MINT'.
+LOGGER = logging.getLogger('MINT')
+
+
+class ReasoningTask(Task):
+    """Free-form reasoning task compared numerically or by substring."""
+
+    task_name = 'reasoning'
+
+    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
+        """Store the id, the trimmed prompt and the normalized reference.
+
+        The reference is converted to ``str``, trimmed and lower-cased.
+        """
+        super().__init__(**kwargs)
+        self._id = id
+        self._prompt = prompt.strip()
+        normalized = str(reference).strip()
+        self._reference = normalized.lower()
+
+    def extract_answer(self, solution: str) -> str | None:
+        """Return the solution lower-cased with surrounding whitespace removed."""
+        lowered = solution.lower()
+        return lowered.strip()
+
+    def compare_w_digits(self, reference: str, answer: str) -> bool:
+        """Compare the reference and answer, numerically when possible.
+
+        When both parse as floats, the answer must lie within 5% of the
+        reference. When parsing raises ``ValueError``, the check falls back to
+        whether the reference is a substring of the answer.
+
+        Raises:
+            ValueError: if parsing fails with any other exception.
+        """
+        try:
+            ref_value = float(reference)
+            ans_value = float(answer)
+        except ValueError:
+            return reference in answer
+        except Exception:
+            raise ValueError(f'Cannot compare {reference} and {answer}')
+        return abs(ref_value - ans_value) <= 0.05 * abs(ref_value)
+
+    def success(self, solution: str) -> bool:
+        """Check the extracted answer against the stored reference."""
+        return self.compare_w_digits(self._reference, self.extract_answer(solution))
+
+
+class MultipleChoiceTask(Task):
+    """Subclass of Task for multiple choice tasks.
+
+    The reference is an option letter. The option texts are parsed out of the
+    prompt and kept both on the instance and in ``metadata['options']``.
+    """
+
+    task_name = 'reasoning'
+
+    def __init__(self, id, prompt: str, reference: str, **kwargs):
+        """Build the task from a prompt that lists its options after 'Options:'.
+
+        When the ``hide_options`` keyword is truthy, the prompt is cut at
+        'Options:' so the options are not shown.
+        """
+        super().__init__(**kwargs)
+        self._id = id
+        self.hide_options = kwargs.get('hide_options', False)
+        if self.hide_options:
+            self._prompt = prompt.split('Options:')[0].strip()
+        else:
+            self._prompt = prompt
+        self._reference = reference.strip().lower()
+        self._options = self.extract_options(prompt)
+        # if all options can be converted to float, strictly perform hide options
+        # NOTE(review): the flag is set after ``_prompt`` was built, so the
+        # prompt itself is not re-trimmed here -- confirm this is intended.
+        try:
+            for option in self._options.values():
+                float(option)
+            self.hide_options = True
+        except ValueError:
+            pass
+        self.metadata.update({'options': self._options})
+
+    def extract_answer(self, solution: str) -> str | None:
+        """Extract the selected option letter from the solution.
+
+        The solution is lower-cased and trimmed, then searched for an option
+        marker: a single letter that is not part of a longer word or number,
+        followed by ``)`` or `` )``. If several letters are marked, the
+        alphabetically first one wins (the order in which letters are scanned).
+        Without any marker the normalized solution itself is returned.
+        """
+        solution = solution.lower().strip()
+        # Every letter has to be considered before falling back to the raw
+        # text; the look-behind keeps words such as "approx)" from matching.
+        marked = re.findall(r'(?<![a-z0-9])([a-z]) ?\)', solution)
+        if marked:
+            letter = min(marked)
+            print('SOLUTION', letter)
+            return letter
+        print('SOLUTION', solution)
+        return solution
+
+    def compare_w_digits(self, reference: str, answer: str) -> bool:
+        """Compare numerically (within 5%) when both are digit strings.
+
+        Otherwise report whether the reference is a substring of the answer.
+        """
+        if reference.isdigit() and answer.isdigit():
+            return abs(float(reference) - float(answer)) <= 0.05 * float(reference)
+        else:
+            return reference in answer
+
+    def success(self, solution: str) -> bool:
+        """Check whether the solution selects the correct option.
+
+        The extracted answer is first compared with the reference letter. If
+        that fails, the answer is compared with the option texts: matching any
+        wrong option fails, matching the correct option succeeds.
+        """
+        answer = self.extract_answer(solution)
+        if self.compare_w_digits(self._reference, answer):
+            return True
+
+        correct_option = self._options.get(self._reference)
+        if correct_option is None:
+            # The reference letter is not among the parsed options, so there
+            # is no option text left to compare the answer with.
+            return False
+
+        option_texts = list(self._options.values())
+        print('OPTIONS', correct_option, option_texts)
+        print('ANSWER', answer)
+        # Build a new list instead of removing while iterating, which would
+        # skip the element following each removal.
+        wrong_options = [
+            text for text in option_texts if text not in correct_option
+        ]
+        for text in wrong_options:
+            if self.compare_w_digits(text, answer) or (text in answer):
+                return False
+        return self.compare_w_digits(correct_option, answer) or (
+            correct_option in answer
+        )
+
+    def extract_options(self, prompt: str) -> dict:
+        """Parse the options listed after 'Options: ' into ``{letter: text}``.
+
+        Options are separated by ' , ' and each has the form ``letter) text``.
+        Texts are lower-cased and the trailing "which option/one is correct?"
+        question is removed. Pieces without a ``)`` are skipped.
+        """
+        prompt = prompt.split('Options: ')[-1]
+        options = {}
+        for piece in prompt.split(' , '):
+            piece = piece.strip("[]' ")
+            # Split on the first ')' only, so option texts may contain ')'.
+            letter, separator, content = piece.partition(')')
+            if not separator:
+                continue
+            # The text is lower-cased first, so the suffixes to drop must be
+            # spelled in lower case to match.
+            content = (
+                content.lower()
+                .strip('.')
+                .replace('. which option is correct?', '')
+                .replace('. which one is correct?', '')
+                .strip()
+            )
+            options[letter.lower().strip()] = content
+        return options
+
+
+# ==== TheoremQA ====
+
+
+def compare_two_numbers(p, gt):
+    """Compare a predicted number with the ground truth.
+
+    Args:
+        p: the prediction. Booleans, complex numbers, dicts, lists, strings
+            and tuples never match.
+        gt: the ground truth. A float is compared through ``within_eps``;
+            anything else is compared with ``round(p)`` for equality.
+
+    Returns:
+        Whether the prediction matches the ground truth.
+
+    Raises:
+        ValueError: if ``p`` is of any other unsupported type.
+    """
+    # bool is a subclass of int, so it has to be rejected before the numeric
+    # check; otherwise True would be accepted as the number 1.
+    if isinstance(p, (bool, complex, dict, list, str, tuple)):
+        return False
+    if not isinstance(p, (int, float)):
+        raise ValueError(p)
+
+    if isinstance(gt, float):
+        return within_eps(pred=p, gt=gt)
+    return round(p) == gt
+
+
+def compare_two_list(pred, gt):
+    """Compare a predicted list of numbers with the ground-truth list.
+
+    The prediction must be a list of ints/floats with the same length as the
+    ground truth. Both are sorted and then compared element by element with
+    ``compare_two_numbers``.
+    """
+    if not isinstance(pred, list) or len(pred) != len(gt):
+        return False
+    for item in pred:
+        if not isinstance(item, (int, float)):
+            return False
+    ordered_pred = sorted(pred)
+    ordered_gt = sorted(gt)
+    return all(
+        [compare_two_numbers(p, g) for p, g in zip(ordered_pred, ordered_gt)]
+    )
+
+def within_eps(pred: float, gt: float):
+    """Tell whether ``pred`` lies within 4% of ``gt``.
+
+    The tolerance is ``0.04 * abs(gt)`` on either side of ``gt``, bounds
+    included.
+    """
+    tolerance = abs(gt) * 0.04
+    inside = pred >= gt - tolerance and pred <= gt + tolerance
+    return bool(inside)
+
+
+def parse_number_list(s: str):
+    """Parse ``s`` as a Python literal and return the resulting value.
+
+    Parsing is delegated to ``ast.literal_eval``, whose exceptions propagate
+    when the string is not a valid literal.
+    """
+    return ast.literal_eval(s)
+
+
+def is_number(string):
+    """Tell whether ``string`` is a plain decimal number.
+
+    An optional sign, digits (optionally grouped in thousands with commas)
+    and an optional fractional part are accepted.
+    """
+    number_pattern = r'^[-+]?(\d{1,3}(,\d{3})*|(\d+))(\.\d+)?$'
+    return re.match(number_pattern, string) is not None
+
+
+def is_scientific_number(string):
+    """Tell whether ``string`` is a number in lower-case ``e`` notation.
+
+    The exponent may carry a minus sign but not a plus sign.
+    """
+    scientific_pattern = r'^[-+]?\d+(\.\d+)?e[-]?\d+$'
+    return re.match(scientific_pattern, string) is not None
+
+
+def contain_num_and_str(string):
+    """Tell whether ``string`` contains both an ASCII letter and a digit."""
+    has_letter = re.search(r'[a-zA-Z]', string) is not None
+    has_digit = re.search(r'[0-9]', string) is not None
+    return has_letter and has_digit
+
+
+class TheoremqaTask(Task):
+    """TheoremQA task: answers are numbers, lists of numbers, booleans or options."""
+
+    task_name = 'reasoning'
+
+    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
+        """Store the id, the instruction-prefixed prompt and the reference.
+
+        The optional ``answer_type`` keyword selects how ``success`` compares
+        the prediction with the reference.
+        """
+        super().__init__(**kwargs)
+        self._id = id
+        self._prompt = (
+            'Answer the following question with a number, a list of numbers or True or False. '
+            + prompt.strip()
+        )
+        self._reference = reference
+        self._answer_type = kwargs.get('answer_type')
+
+    def extract_answer(self, solution: str) -> Any:
+        """Extract the answer from the given solution.
+
+        The text is normalized (special tokens, boolean and approximation
+        keywords, units, number formats, option letters) and then evaluated
+        into a Python value.
+
+        Returns:
+            The evaluated value with complex/Rational/ndarray results turned
+            into plain floats or lists, or ``None`` when evaluation fails.
+        """
+        # Imported locally so that the ``math.pow(...)`` text produced below
+        # resolves inside ``eval``, which sees this function's local names.
+        import math  # noqa: F401
+
+        prediction = solution
+        # Following the preprocessing steps from TheoremQA
+        # https://github.com/wenhuchen/TheoremQA/blob/123e36beaaa97c01f28a582f13c4f77a6822c199/predict_accuracy.py#L170
+
+        # Preprocessing the string [Stage 1]
+        if not isinstance(prediction, str):
+            prediction = str(prediction) if prediction is not None else '0'
+
+        # Replace special tokens
+        if '=' in prediction:
+            prediction = prediction.split('=')[-1].strip()
+        if '≈' in prediction:
+            prediction = prediction.split('≈')[-1].strip()
+        if '`' in prediction:
+            prediction = prediction.replace('`', '')
+        if '$' in prediction:
+            prediction = prediction.replace('$', '')
+        if '°' in prediction:
+            prediction = prediction.replace('°', '')
+
+        # Detect the boolean keyword in the generation
+        if prediction in ('true', 'yes', 'false', 'no'):
+            if prediction in ('true', 'yes'):
+                prediction = 'True'
+            else:
+                prediction = 'False'
+        if 'True' in prediction or 'False' in prediction:
+            prediction = 'True' if 'True' in prediction else 'False'
+
+        # Detect the approximation keyword
+        if 'approximately' in prediction:
+            prediction = prediction.replace('approximately', '').strip()
+        if ' or ' in prediction:
+            prediction = prediction.split(' or ')[0]
+
+        # Drop the units before and after the number
+        if re.match(r'[-+]?(?:[\d,]*\.*\d+) [^0-9 ]+$', prediction):
+            prediction = re.search(
+                r'([-+]?(?:[\d,]*\.*\d+)) [^0-9 ]+$', prediction
+            ).group(1)
+        if re.match(r'[^0-9 ]+ [-+]?(?:[\d,]*\.*\d+)$', prediction):
+            prediction = re.search(
+                r'[^0-9 ]+ ([-+]?(?:[\d,]*\.*\d+))$', prediction
+            ).group(1)
+        if re.match(r'[-+]?(?:[\d,]*\.*\d+)[^\d]{1,2}$', prediction):
+            prediction = re.search(
+                r'([-+]?(?:[\d,]*\.*\d+))[^\d]{1,2}$', prediction
+            ).group(1)
+        if re.match(r'[^-+\d]{1,2}(?:[\d,]*\.*\d+)$', prediction):
+            prediction = re.search(
+                r'[^-+\d]{1,2}((?:[\d,]*\.*\d+))$', prediction
+            ).group(1)
+
+        # Preprocessing the number [Stage 1]
+        if '10^' in prediction:
+            prediction = re.sub(r'10\^(-?\d+)', r'math.pow(10, \1)', prediction)
+        if ' x ' in prediction:
+            prediction = prediction.replace(' x ', '*')
+        if ' × ' in prediction:
+            prediction = prediction.replace(' × ', '*')
+        if is_number(prediction):
+            prediction = prediction.replace(',', '')
+
+        # Preprocessing the option [Stage 3]
+        # Each letter is checked in turn against the current prediction.
+        for letter in 'abcd':
+            if (
+                f'{letter})' in prediction
+                or f'{letter} )' in prediction
+                or prediction.lower().strip() == letter
+            ):
+                prediction = f'({letter})'
+
+        if (
+            '(a)' in prediction
+            or '(b)' in prediction
+            or '(c)' in prediction
+            or '(d)' in prediction
+        ):
+            # Quote the option so that evaluating it yields the string itself.
+            prediction = '"' + re.search(r'\([a-d]\)', prediction).group(0) + '"'
+
+        # If the prediction is empty, use dummy '0'
+        if not prediction:
+            prediction = '0'
+
+        # Converting the string answer to a number/list/bool/option
+        # NOTE(security): this evaluates generated text as Python code; it must
+        # only run where executing untrusted expressions is acceptable.
+        try:
+            prediction = eval(prediction)
+        except Exception:
+            LOGGER.warning(
+                f'[TASK] Failed to convert the answer: {prediction}\n{traceback.format_exc()}'
+            )
+            return None  # failed to convert the answer
+
+        # Performing common type conversion
+        if isinstance(prediction, (set, tuple)):
+            prediction = list(prediction)
+            # An empty tuple/set is a valid result, so only peek at the first
+            # element when there is one.
+            if prediction and isinstance(prediction[0], complex):
+                prediction = [tmp.real for tmp in prediction]
+            elif prediction and isinstance(prediction[0], Rational):
+                prediction = [float(tmp) for tmp in prediction]
+        elif isinstance(prediction, np.ndarray):
+            prediction = prediction.tolist()
+        else:
+            if isinstance(prediction, complex):
+                prediction = prediction.real
+            elif isinstance(prediction, Rational):
+                prediction = float(prediction)
+
+        return prediction
+
+    def success(self, solution: str) -> bool:
+        """This checks whether the given solution can complete the current task.
+
+        Both the solution and the reference go through ``extract_answer``; the
+        comparison then depends on the task's ``answer_type``. A prediction of
+        an unsupported type, or a missing/unrecognized ``answer_type``, counts
+        as a failure.
+        """
+        # Follow the implementation from TheoremQA
+        # https://github.com/wenhuchen/TheoremQA/blob/123e36beaaa97c01f28a582f13c4f77a6822c199/predict_accuracy.py#L301C9-L317C1
+        prediction = self.extract_answer(solution)
+        LOGGER.info(f'TheoremQA Parsed Prediction: {prediction}')
+        answer_type = self._answer_type
+        gt = self.extract_answer(self.reference)
+
+        # Start from failure so that every path below leaves a defined result.
+        cur_correct = False
+        if isinstance(prediction, (str, int, float, list)):
+            # Comparing prediction against the reference
+            if answer_type in ['bool', 'option', 'Option']:
+                cur_correct = prediction == f'({gt})' or prediction == gt
+            elif answer_type in ['integer', 'float']:
+                cur_correct = compare_two_numbers(prediction, gt)
+            elif answer_type in ['list of integer', 'list of float']:
+                cur_correct = compare_two_list(prediction, gt)
+        return bool(cur_correct)