From 391200510c9b5edfa447eed9722f1841792137eb Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Tue, 28 Jan 2025 09:52:57 -0500
Subject: [PATCH] fix: revert #5506 for SWE-Bench performance regression
 (#6491)

Co-authored-by: Robert Brennan
---
 evaluation/benchmarks/swe_bench/run_infer.py  |   8 +-
 .../swe_bench/scripts/eval/compare_outputs.py |  24 ++-
 .../scripts/eval/convert_oh_output_to_md.py   | 183 +++++++++++++++++-
 .../scripts/eval/update_output_with_eval.py   |  16 +-
 .../swe_bench/scripts/eval_infer.sh           |   6 +-
 .../codeact_agent/prompts/system_prompt.j2    |   3 +-
 6 files changed, 212 insertions(+), 28 deletions(-)

diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 7d13d546d8..959576ffcd 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -67,11 +67,11 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
         '<uploaded_files>\n'
         f'/workspace/{workspace_dir_name}\n'
         '</uploaded_files>\n'
-        f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following PR description:\n\n"
-        f'<pr_description>\n'
+        f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+        f'<issue_description>\n'
         f'{instance.problem_statement}\n'
-        '</pr_description>\n\n'
-        'Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?\n'
+        '</issue_description>\n\n'
+        'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
         "I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
         'Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.\n'
         'Follow these steps to resolve the issue:\n'
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
index 0dc9552a25..2099a70b38 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
@@ -15,11 +15,25 @@ parser.add_argument(
     action='store_true',
     help='Show visualization paths for failed instances',
 )
+parser.add_argument(
+    '--only-x-instances',
+    action='store_true',
+    help='Only show instances that are run by X',
+)
 args = parser.parse_args()

 df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
 df2 = pd.read_json(args.input_file_2, orient='records', lines=True)

+if args.only_x_instances:
+    instance_ids_1 = set(df1['instance_id'].tolist())
+    print(
+        f'Before removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
+    )
+    df2 = df2[df2['instance_id'].isin(instance_ids_1)]
+    print(
+        f'After removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
+    )
 # Get the intersection of the instance_ids
 df = pd.merge(df1, df2, on='instance_id', how='inner')

@@ -86,7 +100,7 @@ repo_diffs = []
 for repo in all_repos:
     x_count = len(x_only_by_repo.get(repo, []))
     y_count = len(y_only_by_repo.get(repo, []))
-    diff = abs(x_count - y_count)
+    diff = y_count - x_count
     repo_diffs.append((repo, diff))

 # Sort by diff (descending) and then by repo name
@@ -106,7 +120,13 @@ for repo, diff in repo_diffs:
     repo_color = 'red' if is_significant else 'yellow'

     print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
-    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
+    print(
+        colored(
+            f'Difference: {diff} instances! (Larger diff = Y better)',
+            repo_color,
+            attrs=['bold'],
+        )
+    )
     print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
     if x_instances:
         print(' ' + str(x_instances))
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
index 97698feba3..0e34b8c0c5 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
@@ -4,6 +4,7 @@ import argparse
 import json
 import os
+from glob import glob

 import pandas as pd
 from tqdm import tqdm
@@ -20,6 +21,7 @@ output_md_folder = args.oh_output_file.replace('.jsonl', '.viz')
 print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')

 oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
+output_dir = os.path.dirname(args.oh_output_file)

 swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl')
 if os.path.exists(swebench_eval_file):
@@ -57,22 +59,172 @@ def convert_history_to_str(history):
     return ret


+# Load trajectories for resolved instances
+def load_completions(instance_id: str):
+    global output_dir
+    glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
+    files = sorted(glob(glob_path))  # this is ascending order
+    # pick the last file (last turn)
+    try:
+        file_path = files[-1]
+    except IndexError:
+        # print(f'No files found for instance {instance_id}: files={files}')
+        return None
+    with open(file_path, 'r') as f:
+        result = json.load(f)
+    # create messages
+    messages = result['messages']
+    messages.append(result['response']['choices'][0]['message'])
+    tools = result['kwargs']['tools']
+    return {
+        'messages': messages,
+        'tools': tools,
+    }
+
+
+def _convert_content(content) -> str:
+    ret = ''
+    if isinstance(content, list):
+        for item in content:
+            assert item['type'] == 'text', 'Only text is supported for now'
+            ret += f'{item["text"]}\n'
+    else:
+        assert isinstance(content, str), 'Only str is supported for now'
+        ret = content
+    return ret
+
+
+def convert_tool_call_to_string(tool_call: dict) -> str:
+    """Convert tool call to content in string format."""
+    if 'function' not in tool_call:
+        raise ValueError("Tool call must contain 'function' key.")
+    if 'id' not in tool_call:
+        raise ValueError("Tool call must contain 'id' key.")
+    if 'type' not in tool_call:
+        raise ValueError("Tool call must contain 'type' key.")
+    if tool_call['type'] != 'function':
+        raise ValueError("Tool call type must be 'function'.")
+
+    ret = f"<function={tool_call['function']['name']}>\n"
+    try:
+        args = json.loads(tool_call['function']['arguments'])
+    except json.JSONDecodeError as e:
+        raise ValueError(
+            f"Failed to parse arguments as JSON. Arguments: {tool_call['function']['arguments']}"
+        ) from e
+    for param_name, param_value in args.items():
+        is_multiline = isinstance(param_value, str) and '\n' in param_value
+        ret += f'<parameter={param_name}>'
+        if is_multiline:
+            ret += '\n'
+        ret += f'{param_value}'
+        if is_multiline:
+            ret += '\n'
+        ret += '</parameter>\n'
+    ret += '</function>'
+    return ret
+
+
+def format_traj(traj, first_n_turns=None, last_n_turns=None) -> str:
+    output = ''
+    system_message = None
+
+    # Handle system message if present
+    if traj[0]['role'] == 'system':
+        system_message = traj[0]
+        traj = traj[1:]
+        content = _convert_content(system_message['content'])
+        output += "*** System Message that describes the assistant's behavior ***\n"
+        output += f'{content}\n'
+
+    # Merge consecutive user messages first
+    merged_traj = []
+    current_messages = []
+
+    n_turns = len(traj)
+    for i, message in enumerate(traj):
+        # Skip this message if...
+        if (
+            # Case 1: first_n_turns specified and we're past it
+            (first_n_turns is not None and i >= first_n_turns and last_n_turns is None)
+            or
+            # Case 2: last_n_turns specified and we're before it
+            (
+                last_n_turns is not None
+                and i < n_turns - last_n_turns
+                and first_n_turns is None
+            )
+            or
+            # Case 3: both specified and we're in the middle section
+            (
+                first_n_turns is not None
+                and last_n_turns is not None
+                and i >= first_n_turns
+                and i < n_turns - last_n_turns
+            )
+        ):
+            continue
+
+        if message['role'] == 'user':
+            current_messages.append(message)
+        else:
+            if current_messages:
+                # Merge all accumulated user messages into one
+                merged_content = '\n'.join(
+                    _convert_content(msg['content']) for msg in current_messages
+                )
+                merged_traj.append({'role': 'user', 'content': merged_content})
+                current_messages = []
+            merged_traj.append(message)
+
+    # Don't forget to handle any remaining user messages
+    if current_messages:
+        merged_content = '\n'.join(
+            _convert_content(msg['content']) for msg in current_messages
+        )
+        merged_traj.append({'role': 'user', 'content': merged_content})
+
+    # Now process the merged trajectory
+    for i, message in enumerate(merged_traj):
+        role, content = message['role'], message['content']
+        content = _convert_content(content) if isinstance(content, list) else content
+        turn_id = i // 2 + 1
+        output += '-' * 100 + '\n'
+        output += f'*** Turn {turn_id} - {role.upper() if role != "tool" else "TOOL EXECUTION RESULT"} ***\n'
+
+        if role == 'user':
+            output += f'{content}\n'
+        elif role == 'tool':
+            output += f'{content}\n'
+        elif role == 'assistant':
+            output += f'{content}\n'
+            if (
+                'tool_calls' in message
+                and message['tool_calls'] is not None
+                and len(message['tool_calls']) > 0
+            ):
+                for toolcall_id, tool_call in enumerate(message['tool_calls']):
+                    output += f'### Tool Call {toolcall_id}\n'
+                    output += f'{convert_tool_call_to_string(tool_call)}\n'
+        else:
+            raise ValueError(f'Unexpected role: {role}')
+
+        output += '-' * 100 + '\n'
+    return output
+
+
 def write_row_to_md_file(row, instance_id_to_test_result):
     if 'git_patch' in row:
         model_patch = row['git_patch']
     elif 'test_result' in row and 'git_patch' in row['test_result']:
         model_patch = row['test_result']['git_patch']
     else:
-        raise ValueError(f'Row {row} does not have a git_patch')
+        print(f'Row {row} does not have a git_patch')
+        return

     test_output = None
-    if row['instance_id'] in instance_id_to_test_result:
-        report = instance_id_to_test_result[row['instance_id']].get('report', {})
-        resolved = report.get('resolved', False)
-        test_output = instance_id_to_test_result[row['instance_id']].get(
-            'test_output', None
-        )
-    elif 'report' in row and row['report'] is not None:
+    # Use result from output.jsonl FIRST if available.
+    if 'report' in row and row['report'] is not None:
         if not isinstance(row['report'], dict):
             resolved = None
             print(
@@ -80,6 +232,12 @@ def write_row_to_md_file(row, instance_id_to_test_result):
             )
         else:
             resolved = row['report'].get('resolved', False)
+    elif row['instance_id'] in instance_id_to_test_result:
+        report = instance_id_to_test_result[row['instance_id']].get('report', {})
+        resolved = report.get('resolved', False)
+        test_output = instance_id_to_test_result[row['instance_id']].get(
+            'test_output', None
+        )
     else:
         resolved = None

@@ -88,6 +246,8 @@ def write_row_to_md_file(row, instance_id_to_test_result):
     os.makedirs(output_md_folder, exist_ok=True)
     filepath = os.path.join(output_md_folder, filename)

+    completions = load_completions(instance_id)
+
     with open(filepath, 'w') as f:
         f.write(f'# {instance_id} (resolved: {resolved})\n')

@@ -97,7 +257,12 @@ def write_row_to_md_file(row, instance_id_to_test_result):
         f.write(json.dumps(row['metadata'], indent=2))
         f.write('\n```\n')

-        # Trajectory
+        # Completion
+        if completions is not None:
+            f.write('## Completion\n')
+            traj = completions['messages']
+            f.write(format_traj(traj))
+
         f.write('## History\n')
         f.write(convert_history_to_str(row['history']))

diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
index f8527acd7a..cf05503400 100644
--- a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
@@ -207,12 +207,13 @@ with open(args.input_file, 'r') as infile:
     for line in tqdm(infile, desc='Checking for changes'):
         data = json.loads(line)
         instance_id = data['instance_id']
-        if instance_id in instance_id_to_status:
-            current_report = data.get('report', {})
-            new_report = instance_id_to_status[instance_id]
-            if current_report != new_report:
-                needs_update = True
-                break
+        current_report = data.get('report', {})
+        new_report = instance_id_to_status[
+            instance_id
+        ]  # if no report, it's not resolved
+        if current_report != new_report:
+            needs_update = True
+            break

 if not needs_update:
     print('No updates detected. Skipping file update.')
@@ -234,6 +235,5 @@ with open(args.input_file + '.bak', 'r') as infile, open(
     for line in tqdm(infile, desc='Updating output file'):
         data = json.loads(line)
         instance_id = data['instance_id']
-        if instance_id in instance_id_to_status:
-            data['report'] = instance_id_to_status[instance_id]
+        data['report'] = instance_id_to_status[instance_id]
         outfile.write(json.dumps(data) + '\n')
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
index 13ef271671..b39986615e 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
@@ -76,7 +76,7 @@ echo "Running SWE-bench evaluation"
 echo "=============================================================="

 RUN_ID=$(date +"%Y%m%d_%H%M%S")
-N_PROCESS=16
+N_PROCESS=4

 if [ -z "$INSTANCE_ID" ]; then
     echo "Running SWE-bench evaluation on the whole input file..."
@@ -87,7 +87,7 @@ if [ -z "$INSTANCE_ID" ]; then
         --dataset_name "$DATASET_NAME" \
         --split "$SPLIT" \
         --predictions_path $SWEBENCH_FORMAT_JSONL \
-        --timeout 1800 \
+        --timeout 3600 \
         --cache_level instance \
         --max_workers $N_PROCESS \
         --run_id $RUN_ID
@@ -133,7 +133,7 @@ else
         --dataset_name "$DATASET_NAME" \
         --split "$SPLIT" \
         --predictions_path $SWEBENCH_FORMAT_JSONL \
-        --timeout 1800 \
+        --timeout 3600 \
         --instance_ids $INSTANCE_ID \
         --cache_level instance \
         --max_workers $N_PROCESS \
diff --git a/openhands/agenthub/codeact_agent/prompts/system_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/system_prompt.j2
index 84d1d6a4f7..325392f2e6 100644
--- a/openhands/agenthub/codeact_agent/prompts/system_prompt.j2
+++ b/openhands/agenthub/codeact_agent/prompts/system_prompt.j2
@@ -1,7 +1,6 @@
 You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
 <IMPORTANT>
 * If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
-* You should start exploring the file system with your view command, unless you need to explore more deeply.
 * When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.
-* You MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
+* The assistant MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
 </IMPORTANT>
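
Note (not part of the patch): for reference, below is a minimal sketch of the X-vs-Y
comparison that the updated compare_outputs.py performs once two runs have been
evaluated. The file names x.output.jsonl and y.output.jsonl are hypothetical
placeholders; the real script additionally groups regressions by repo and prints
colored summaries, as in the hunks above.

    import pandas as pd

    def is_resolved(report):
        # Treat a missing or malformed report as unresolved.
        return isinstance(report, dict) and bool(report.get('resolved', False))

    # Two hypothetical SWE-bench evaluation outputs with 'instance_id' and 'report' fields.
    df_x = pd.read_json('x.output.jsonl', orient='records', lines=True)
    df_y = pd.read_json('y.output.jsonl', orient='records', lines=True)

    # With --only-x-instances, Y is first restricted to the instances X ran.
    df_y = df_y[df_y['instance_id'].isin(set(df_x['instance_id']))]

    # Inner-join on instance_id (the script's pd.merge step); pandas suffixes
    # the overlapping 'report' column as report_x / report_y.
    df = pd.merge(df_x, df_y, on='instance_id', how='inner')

    resolved_x = df['report_x'].apply(is_resolved)
    resolved_y = df['report_y'].apply(is_resolved)
    print('X resolved but Y failed:', df.loc[resolved_x & ~resolved_y, 'instance_id'].tolist())
    print('Y resolved but X failed:', df.loc[resolved_y & ~resolved_x, 'instance_id'].tolist())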