* Add AgentBench.
* Load the datasets from HF.
* Add helper functions.
* Add mock executor.
* Add retriv agent answer cmd.
* Adjust the dataset.
* Refine test results.
* Consolidate all AgentBench datasets and scripts into a single CSV dataset.
* Refactor dataset source.
* Update helper functions.
* Fix the CRLF problem.
* Separate the instance's workspace.
* Add cleanup logic and error handling for sandbox closure.
* Normalized dataset
* Update README.
* Update the prompt to capture the answer.
* Refactor script execution paths to use absolute container workspace path.
* Update AgentBench README.
* Delete useless functions.
* Update evaluation/agent_bench/README.md
* Add script to summarize test results from JSONL file in AgentBench
* Delete useless script and codes.
* Update evaluation/agent_bench/scripts/summarise_results.py
---------
Signed-off-by: ifuryst <ifuryst@gmail.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
38 lines
1.2 KiB
Python
import json
import sys


def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
    """Split instance IDs from an evaluation output JSONL file into passed/failed lists."""
    passed = []
    failed = []
    with open(res_file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            instance_id = data['instance_id']
            # An instance counts as resolved only if test_result.result is truthy;
            # records missing those fields are treated as failures.
            resolved = False
            if 'test_result' in data and 'result' in data['test_result']:
                resolved = data['test_result']['result']
            if resolved:
                passed.append(instance_id)
            else:
                failed.append(instance_id)
    return passed, failed


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
    passed_tests, failed_tests = extract_test_results(json_file_path)
    total = len(passed_tests) + len(failed_tests)
    # Guard against an empty results file, which would otherwise divide by zero.
    if total == 0:
        print('No test results found in the given file.')
        sys.exit(1)
    succ_rate = len(passed_tests) / total
    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
    )
    print('PASSED TESTS:')
    print(passed_tests)
    print('FAILED TESTS:')
    print(failed_tests)
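# Usage sketch. The record shape below is inferred from the parser above; the
# instance_id value and the file name are made-up placeholders. Each line of
# the output JSONL file is expected to be a JSON object such as:
#   {"instance_id": "os-std-1", "test_result": {"result": true}}
# and the script is invoked as in the usage string, e.g.:
#   poetry run python summarise_results.py output.jsonl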