mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
* setup boilerplate and README * setup test script and load dataset * add temp intg that works * refactor code * add solution evaluation through 'fake_user_response_fn' * finish integrating MATH subset * Update evaluation/mint/run_infer.py * Update evaluation/mint/run_infer.sh * Update opendevin/core/main.py * remove redundant templates, add eval_note, update README * use <execute_ipython> tag instead of <execute> * hardcode AGENT option for run_infer.sh * Update evaluation/mint/task.py Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> * fix: bug no message returned when task's success * change message to make the agent exit * import bash abstractmethod * install all required packages inside sandbox before the agent runs, adjust prompt * add subset eval folder separation and test for gsm8k * fix bug in Reasoning task result check, add requirements.txt * Fix syntax error in evaluation/mint/run_infer.py * update README, add default values for `SUBSET` and `EVAL_LIMIT` --------- Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com> Co-authored-by: yufansong <yufan@risingwave-labs.com> Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
122 lines
4.0 KiB
Python
122 lines
4.0 KiB
Python
import json
|
|
import logging
|
|
import os
|
|
from abc import ABC, abstractmethod
|
|
from typing import List, Optional, Tuple
|
|
|
|
from utils import load_file
|
|
|
|
LOGGER = logging.getLogger('MINT')
|
|
|
|
|
|
class Task(ABC):
    """Base class for a task instance."""

    task_name: str = 'base'
    # Directory holding the pre-written in-context example files, resolved
    # relative to this module so it works from any working directory.
    in_context_example_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'in_context_examples',
    )

    def __init__(self, **kwargs) -> None:
        """Initialize the task and pre-load its in-context example.

        Keyword Args:
            loaded_history: optional previously recorded interaction history;
                stored as-is, defaults to None when absent.
        """
        # dict.get replaces the explicit membership test and defaults to None.
        self.loaded_history = kwargs.get('loaded_history')

        # Pre-load the in-context example for this task type.
        task_dir = os.path.join(self.in_context_example_dir, self.task_name)
        self._in_context_example = {
            'with_tool': load_file(os.path.join(task_dir, 'with_tool.txt')),
        }
        self.metadata = {}

    @property
    def task_id(self) -> str:
        """Return the task id."""
        assert hasattr(self, '_id'), 'Task does not have an id.'
        return self._id

    def in_context_example(
        self, use_tool: bool = True, with_feedback: bool = False
    ) -> str:
        """Return the in-context example for the task.

        Only the combination (use_tool=True, with_feedback=False) is
        supported; any other combination raises NotImplementedError.
        """
        if use_tool and not with_feedback:
            return self._in_context_example['with_tool']
        raise NotImplementedError

    @property
    def prompt(self) -> str:
        """Return the task prompt."""
        assert hasattr(self, '_prompt'), 'Task does not have a prompt.'
        return self._prompt

    @property
    def reference(self) -> str:
        """Return the reference solution for the task."""
        assert hasattr(self, '_reference'), 'Task does not have a reference solution.'
        return self._reference

    @abstractmethod
    def extract_answer(self, solution: str) -> Optional[str]:
        """Extract the answer from the given solution."""

    @abstractmethod
    def success(self, solution: str) -> bool:
        """This checks whether the given solution can complete the current task.

        Can be used to provide binary feedback.
        """
        answer = self.extract_answer(solution)
        return answer == self.reference

    @classmethod
    def load_tasks(cls, path: str) -> Tuple[List['Task'], int]:
        """Load all the tasks from a given jsonl file.

        Each line is parsed as JSON and splatted into the constructor.

        Returns:
            A tuple of (list of tasks, number of tasks).
        """
        assert path.endswith('.jsonl') or path.endswith('.json')
        with open(path, 'r') as f:
            # Iterate the file object directly instead of materializing
            # every line up front with readlines().
            tasks = [cls(**json.loads(line)) for line in f]
        LOGGER.info(f'Loaded {len(tasks)} tasks from {path}')
        return tasks, len(tasks)

    def to_dict(self) -> dict:
        """Convert the task to a dictionary."""
        return {
            'task_name': self.task_name,
            'task_id': self.task_id,
            'prompt': self.prompt,
            'reference': self.reference,
            'metadata': self.metadata,
        }
|
|
|
|
|
|
class ReasoningTask(Task):
    """A reasoning task whose answer is checked against a normalized reference."""

    task_name = 'reasoning'

    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
        """Initialize the task.

        Args:
            id: unique task identifier.
            prompt: the task prompt (stored whitespace-stripped).
            reference: reference solution (stored stripped and lowercased).
        """
        super().__init__(**kwargs)
        self._id = id
        self._prompt = prompt.strip()
        self._reference = str(reference).strip().lower()

    def extract_answer(self, solution: str) -> Optional[str]:
        """Extract the answer from the given solution."""
        return solution.lower().strip()

    def compare_w_digits(self, reference: str, answer: str) -> bool:
        """Compare the reference and answer with digits.

        If both strings parse as floats, accept answers within a 5% relative
        tolerance of the reference; otherwise fall back to a substring
        containment check (`reference in answer`).

        Raises:
            ValueError: if the comparison fails for an unexpected reason
                (e.g. a non-string argument).
        """
        try:
            # Convert once and reuse, instead of re-parsing each string
            # a second time in the comparison expression.
            ref = float(reference)
            ans = float(answer)
        except ValueError:
            # Non-numeric text: a substring match counts as success.
            return reference in answer
        except Exception as err:
            # Chain the cause so the original failure is not lost.
            raise ValueError(f'Cannot compare {reference} and {answer}') from err
        return abs(ref - ans) <= 0.05 * abs(ref)

    def success(self, solution: str) -> bool:
        """Check whether the extracted answer matches the reference solution."""
        answer = self.extract_answer(solution)
        return self.compare_w_digits(self._reference, answer)