mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-10 23:38:08 -05:00
feat(eval): better visualization for comparing two swe-bench runs (#5993)
This commit is contained in:
@@ -1,13 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
from termcolor import colored
|
||||
|
||||
# Command-line interface: two positional JSONL paths plus an opt-in flag.
parser = argparse.ArgumentParser(
    description='Compare two swe_bench output JSONL files and print the resolved diff'
)
for positional_name in ('input_file_1', 'input_file_2'):
    parser.add_argument(positional_name, type=str)
parser.add_argument(
    '--show-paths',
    help='Show visualization paths for failed instances',
    action='store_true',
)
args = parser.parse_args()

# Load the first input as line-delimited JSON records.
df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
|
||||
@@ -58,10 +65,60 @@ df_diff_y_only = df_diff[~df_diff['resolved_x'] & df_diff['resolved_y']].sort_va
|
||||
def _bucket_by_repo(instance_ids):
    """Group instance ids under the text that precedes their first '__'."""
    buckets = {}
    for iid in instance_ids:
        # partition() gives the same leading piece as split('__')[0],
        # including when the separator is absent.
        prefix = iid.partition('__')[0]
        buckets.setdefault(prefix, []).append(iid)
    return buckets


# Rows of df_diff_y_only: count first, then the id and both report columns.
y_only_count = df_diff_y_only.shape[0]
print(f'# y resolved but x not={y_only_count}')
print(df_diff_y_only[['instance_id', 'report_x', 'report_y']])

x_only_ids = df_diff_x_only['instance_id'].tolist()
y_only_ids = df_diff_y_only['instance_id'].tolist()
separator = '-' * 100

print(separator)
print('Instances that x resolved but y not:')
print(x_only_ids)

# Plain dicts mapping repo prefix -> list of instance ids, in input order.
x_only_by_repo = _bucket_by_repo(x_only_ids)
y_only_by_repo = _bucket_by_repo(y_only_ids)

print(separator)
print('Instances that y resolved but x not:')
print(y_only_ids)
|
||||
# Banner for the per-repository breakdown.
banner = colored(
    'Repository comparison (x resolved vs y resolved):', 'cyan', attrs=['bold']
)
print(banner)

# Every repo that appears on either side, in name order.
all_repos = sorted(x_only_by_repo.keys() | y_only_by_repo.keys())

# Pair each repo with the absolute gap between its x-only and y-only counts.
repo_diffs = [
    (
        name,
        abs(len(x_only_by_repo.get(name, [])) - len(y_only_by_repo.get(name, []))),
    )
    for name in all_repos
]

# Largest gap first; ties fall back to repo name.
repo_diffs = sorted(repo_diffs, key=lambda pair: (-pair[1], pair[0]))

# Threshold is 1.5x the mean gap, but never below 3.
if repo_diffs:
    mean_gap = sum(gap for _, gap in repo_diffs) / len(repo_diffs)
    threshold = max(3, mean_gap * 1.5)
else:
    threshold = 3

# 'output.viz' directory located next to the first input file.
x_run_dir = os.path.dirname(args.input_file_1)
x_input_file_folder = os.path.join(x_run_dir, 'output.viz')
|
||||
|
||||
# Per-repository report, walking repo_diffs in its existing order.
# NOTE(review): the reviewed copy had lost its indentation; the path listing
# is assumed to sit inside the non-empty y-only branch -- confirm against the
# original file.
for repo_name, gap in repo_diffs:
    x_wins = x_only_by_repo.get(repo_name, [])
    y_wins = y_only_by_repo.get(repo_name, [])

    # Gaps at or above the threshold are shown in red, the rest in yellow.
    if gap >= threshold:
        repo_color = 'red'
    else:
        repo_color = 'yellow'

    # The gap line is emitted ahead of the repo heading, which opens with a
    # newline.
    print(colored(f'Difference: {gap} instances!', repo_color, attrs=['bold']))
    heading = colored(repo_name, repo_color, attrs=['bold'])
    print(f'\n{heading}:')

    print(colored(f'X resolved but Y failed: ({len(x_wins)} instances)', 'green'))
    if x_wins:
        print(f' {x_wins}')

    print(colored(f'Y resolved but X failed: ({len(y_wins)} instances)', 'red'))
    if not y_wins:
        continue
    print(f' {y_wins}')

    if not args.show_paths:
        continue
    print(colored(' Visualization path for X failed:', 'cyan', attrs=['bold']))
    for failed_id in y_wins:
        # One 'false.<instance_id>.md' path per y-only instance.
        print(' ' + os.path.join(x_input_file_folder, f'false.{failed_id}.md'))
|
||||
|
||||
Reference in New Issue
Block a user