Files
OpenHands/evaluation/mint/datatypes.py
Ryan H. Tran 9434bcce48 Support MINT benchmark (MATH, GSM8K subset) (#1955)
* setup boilerplate and README

* setup test script and load dataset

* add temporary integration that works

* refactor code

* add solution evaluation through 'fake_user_response_fn'

* finish integrating MATH subset

* Update evaluation/mint/run_infer.py

* Update evaluation/mint/run_infer.sh

* Update opendevin/core/main.py

* remove redundant templates, add eval_note, update README

* use <execute_ipython> tag instead of <execute>

* hardcode AGENT option for run_infer.sh

* Update evaluation/mint/task.py

Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>

* fix: bug where no message was returned when the task succeeded

* change message to make the agent exit

* import bash abstractmethod

* install all required packages inside sandbox before the agent runs, adjust prompt

* add subset eval folder separation and test for gsm8k

* fix bug in Reasoning task result check, add requirements.txt

* Fix syntax error in evaluation/mint/run_infer.py

* update README, add default values for `SUBSET` and `EVAL_LIMIT`

---------

Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
Co-authored-by: yufansong <yufan@risingwave-labs.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
2024-05-28 07:42:52 +00:00

83 lines
2.3 KiB
Python

import enum
from typing import Any, Dict, Optional, Tuple
class TaskState:
    """Mutable bookkeeping for a single MINT task run.

    Tracks whether the task has finished and succeeded, how many actions of
    each kind the agent has taken, why the run terminated, and the output of
    the most recent step.
    """

    def __init__(
        self,
        finished: bool = False,
        success: bool = False,
        agent_action_count: Optional[Dict[str, int]] = None,
        terminate_reason: Optional[str] = None,
        latest_output: Optional[Dict[str, Any]] = None,
    ):
        """Initialize the task state.

        Args:
            finished: Whether the task run has ended.
            success: Whether the task was solved successfully.
            agent_action_count: Per-action-type counters; defaults to zeroed
                counters for the three known action kinds.
            terminate_reason: Human-readable reason the run ended, if any.
            latest_output: Output payload of the most recent step, if any.
        """
        self.finished = finished
        self.success = success
        # Avoid a shared mutable default: build a fresh counter dict per
        # instance when the caller does not supply one.
        self.agent_action_count: Dict[str, int] = agent_action_count or {
            'propose_solution': 0,
            'use_tool': 0,
            'invalid_action': 0,
        }
        self.terminate_reason = terminate_reason
        self.latest_output = latest_output

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable snapshot of this state."""
        return {
            'finished': self.finished,
            'success': self.success,
            'agent_action_count': self.agent_action_count,
            'terminate_reason': self.terminate_reason,
            'latest_output': self.latest_output,
        }
class ParseError(Exception):
    """Raised when an agent's response cannot be parsed into a valid action."""
class FeedbackType(enum.Enum):
    """Kind of feedback given to the agent after a proposed solution."""

    FEEDBACK_WITH_GT = 'feedback_with_gt'  # feedback computed with ground truth
    FEEDBACK_WO_GT = 'feedback_wo_gt'  # feedback computed without ground truth
    NO_FEEDBACK = 'no_feedback'  # no feedback is provided at all
class StepOutput:
    """Result of a single agent step: an observation plus success metadata."""

    def __init__(
        self,
        observation: Optional[str] = None,
        success: bool = False,
        extra: Optional[Dict[str, Any]] = None,
        turn_info: Optional[Tuple[int, int]] = None,
    ):
        """Initialize a step output.

        Args:
            observation: Raw observation text from the environment, if any.
            success: Whether a proposed solution was judged correct.
            extra: Optional auxiliary data attached to this step.
            turn_info: Optional ``(n_steps_left, n_propose_solution_left)``
                budget counters shown to the agent.
        """
        self.observation: Optional[str] = observation
        self.success: bool = success
        self.extra: Optional[Dict[str, Any]] = extra
        self.turn_info = turn_info

    def __repr__(self) -> str:
        # Bug fix: the previous implementation returned self.observation
        # directly, which raised "TypeError: __repr__ returned non-string"
        # whenever observation was None (its default).
        return self.observation if self.observation is not None else ''

    def to_str(self) -> str:
        """Render this output as the message shown to the agent."""
        output = 'Observation:\n'
        if self.observation is not None:
            output += self.observation + '\n'
        else:
            # No raw observation means the step was a solution proposal;
            # report correctness instead.
            if not self.success:
                output += 'Your answer is wrong.\n'
        if self.turn_info is not None:
            n_steps_left, n_propose_solution_left = self.turn_info
            output += 'You have {} steps left and {} chances to propose solution left.\n'.format(
                n_steps_left, n_propose_solution_left
            )
            if n_steps_left <= 1:
                output += 'You should take the last step to propose a solution.\n'
        return output

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable summary (observation and success only)."""
        return {
            'observation': self.observation,
            'success': self.success,
        }