feat(eval): better visualization for comparing two swe-bench runs (#5993)

2026-01-10 23:38:08 -05:00 · 2025-01-02 21:36:51 -05:00
parent c567c11267
commit 61ebec9ff7
1 changed files with 62 additions and 5 deletions
--- a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
@@ -1,13 +1,20 @@
 #!/usr/bin/env python3
 import argparse
+import os

 import pandas as pd
+from termcolor import colored

 parser = argparse.ArgumentParser(
    description='Compare two swe_bench output JSONL files and print the resolved diff'
 )
 parser.add_argument('input_file_1', type=str)
 parser.add_argument('input_file_2', type=str)
+parser.add_argument(
+    '--show-paths',  # consulted in the 'Y resolved but X failed' listing
+    action='store_true',  # boolean switch: False unless the flag is passed
+    help='Show visualization paths for failed instances',
+)
 args = parser.parse_args()

 df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
@@ -58,10 +65,60 @@ df_diff_y_only = df_diff[~df_diff['resolved_x'] & df_diff['resolved_y']].sort_va
 print(f'# y resolved but x not={df_diff_y_only.shape[0]}')
 print(df_diff_y_only[['instance_id', 'report_x', 'report_y']])
 # get instance_id from df_diff_y_only
-print('-' * 100)
-print('Instances that x resolved but y not:')
-print(df_diff_x_only['instance_id'].tolist())
+
+# Group the x-only instance ids by the part of the id before the first '__'.
+x_only_by_repo = {}
+for iid in df_diff_x_only['instance_id'].tolist():
+    x_only_by_repo.setdefault(iid.partition('__')[0], []).append(iid)
+# Same grouping for the y-only instance ids.
+y_only_by_repo = {}
+for iid in df_diff_y_only['instance_id'].tolist():
+    y_only_by_repo.setdefault(iid.partition('__')[0], []).append(iid)

 print('-' * 100)
-print('Instances that y resolved but x not:')
-print(df_diff_y_only['instance_id'].tolist())
+banner = 'Repository comparison (x resolved vs y resolved):'
+print(colored(banner, 'cyan', attrs=['bold']))
+all_repos = sorted(set(x_only_by_repo) | set(y_only_by_repo))
+
+# Pair each repo with the absolute gap between its x-only and y-only counts.
+repo_diffs = []
+for name in all_repos:
+    gap = len(x_only_by_repo.get(name, [])) - len(y_only_by_repo.get(name, []))
+    repo_diffs.append((name, abs(gap)))
+
+# Largest gap first; ties fall back to alphabetical repo name.
+repo_diffs.sort(key=lambda pair: (-pair[1], pair[0]))
+# A gap counts as significant from 1.5x the mean gap upward, never below 3.
+if repo_diffs:
+    scaled_mean = sum(pair[1] for pair in repo_diffs) / len(repo_diffs) * 1.5
+else:
+    scaled_mean = 0
+threshold = max(3, scaled_mean)
+
+# Printed visualization paths point into 'output.viz' beside the first input file.
+x_input_file_folder = os.path.join(os.path.dirname(args.input_file_1), 'output.viz')
+
+for repo, diff in repo_diffs:
+    x_instances = x_only_by_repo.get(repo, [])
+    y_instances = y_only_by_repo.get(repo, [])
+
+    # Red marks a gap at or above the threshold, yellow anything smaller.
+    repo_color = 'red' if diff >= threshold else 'yellow'
+    # Heading goes first so the difference line sits under its own repo,
+    # not at the tail of the previous repo's listing.
+    print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
+    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
+    print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
+    if x_instances:
+        print('  ' + str(x_instances))
+    print(colored(f'Y resolved but X failed: ({len(y_instances)} instances)', 'red'))
+    if y_instances:
+        print('  ' + str(y_instances))
+        if args.show_paths:
+            print(
+                colored('    Visualization path for X failed:', 'cyan', attrs=['bold'])
+            )
+            for instance_id in y_instances:
+                instance_file = os.path.join(
+                    x_input_file_folder, f'false.{instance_id}.md'
+                )
+                print(f'    {instance_file}')