Co-authored-by: Robert Brennan <accounts@rbren.io>
@@ -67,11 +67,11 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
         '<uploaded_files>\n'
         f'/workspace/{workspace_dir_name}\n'
         '</uploaded_files>\n'
-        f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following PR description:\n\n"
-        f'<pr_description>\n'
+        f"I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:\n\n"
+        f'<issue_description>\n'
         f'{instance.problem_statement}\n'
-        '</pr_description>\n\n'
-        'Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?\n'
+        '</issue_description>\n\n'
+        'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
         "I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
         'Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.\n'
         'Follow these steps to resolve the issue:\n'

@@ -15,11 +15,25 @@ parser.add_argument(
     action='store_true',
     help='Show visualization paths for failed instances',
 )
+parser.add_argument(
+    '--only-x-instances',
+    action='store_true',
+    help='Only show instances that are ran by X',
+)
 args = parser.parse_args()
 
 df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
 df2 = pd.read_json(args.input_file_2, orient='records', lines=True)
 
+if args.only_x_instances:
+    instance_ids_1 = set(df1['instance_id'].tolist())
+    print(
+        f'Before removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
+    )
+    df2 = df2[df2['instance_id'].isin(instance_ids_1)]
+    print(
+        f'After removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
+    )
+
 # Get the intersection of the instance_ids
 df = pd.merge(df1, df2, on='instance_id', how='inner')
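
For context, pd.merge(..., how='inner') keeps only rows whose instance_id appears in both input files, so every comparison below runs over the shared instance set. A minimal sketch, with made-up instance ids:

import pandas as pd

df1 = pd.DataFrame({'instance_id': ['a', 'b', 'c'], 'resolved_x': [1, 0, 1]})
df2 = pd.DataFrame({'instance_id': ['b', 'c', 'd'], 'resolved_y': [1, 1, 0]})

# Inner merge keeps only the intersection of instance ids: 'b' and 'c'.
df = pd.merge(df1, df2, on='instance_id', how='inner')
print(df['instance_id'].tolist())  # ['b', 'c']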
@@ -86,7 +100,7 @@ repo_diffs = []
 for repo in all_repos:
     x_count = len(x_only_by_repo.get(repo, []))
     y_count = len(y_only_by_repo.get(repo, []))
-    diff = abs(x_count - y_count)
+    diff = y_count - x_count
     repo_diffs.append((repo, diff))
 
 # Sort by diff (descending) and then by repo name
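
The switch from abs(x_count - y_count) to y_count - x_count makes the sign carry direction: a positive diff now means Y uniquely resolves more instances in that repo than X, which the '(Larger diff = Y better)' label in the next hunk depends on. A quick check of the semantics:

x_count, y_count = 3, 7
assert abs(x_count - y_count) == 4   # old value: magnitude only, direction lost
assert y_count - x_count == 4        # new value: positive => Y better
x_count, y_count = 9, 5
assert y_count - x_count == -4       # negative => X better, previously hidden by abs()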
@@ -106,7 +120,13 @@ for repo, diff in repo_diffs:
     repo_color = 'red' if is_significant else 'yellow'
 
     print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
-    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
+    print(
+        colored(
+            f'Difference: {diff} instances! (Larger diff = Y better)',
+            repo_color,
+            attrs=['bold'],
+        )
+    )
     print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
     if x_instances:
         print(' ' + str(x_instances))

@@ -4,6 +4,7 @@
 import argparse
 import json
 import os
+from glob import glob
 
 import pandas as pd
 from tqdm import tqdm
@@ -20,6 +21,7 @@ output_md_folder = args.oh_output_file.replace('.jsonl', '.viz')
 print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')
 
 oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
+output_dir = os.path.dirname(args.oh_output_file)
 
 swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl')
 if os.path.exists(swebench_eval_file):
@@ -57,22 +59,172 @@ def convert_history_to_str(history):
     return ret
 
 
+# Load trajectories for resolved instances
+def load_completions(instance_id: str):
+    global output_dir
+    glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
+    files = sorted(glob(glob_path))  # this is ascending order
+    # pick the last file (last turn)
+    try:
+        file_path = files[-1]
+    except IndexError:
+        # print(f'No files found for instance {instance_id}: files={files}')
+        return None
+    with open(file_path, 'r') as f:
+        result = json.load(f)
+    # create messages
+    messages = result['messages']
+    messages.append(result['response']['choices'][0]['message'])
+    tools = result['kwargs']['tools']
+    return {
+        'messages': messages,
+        'tools': tools,
+    }
+
+
+def _convert_content(content) -> str:
+    ret = ''
+    if isinstance(content, list):
+        for item in content:
+            assert item['type'] == 'text', 'Only text is supported for now'
+            ret += f'{item["text"]}\n'
+    else:
+        assert isinstance(content, str), 'Only str is supported for now'
+        ret = content
+    return ret
+
+
+def convert_tool_call_to_string(tool_call: dict) -> str:
+    """Convert tool call to content in string format."""
+    if 'function' not in tool_call:
+        raise ValueError("Tool call must contain 'function' key.")
+    if 'id' not in tool_call:
+        raise ValueError("Tool call must contain 'id' key.")
+    if 'type' not in tool_call:
+        raise ValueError("Tool call must contain 'type' key.")
+    if tool_call['type'] != 'function':
+        raise ValueError("Tool call type must be 'function'.")
+
+    ret = f"<function={tool_call['function']['name']}>\n"
+    try:
+        args = json.loads(tool_call['function']['arguments'])
+    except json.JSONDecodeError as e:
+        raise ValueError(
+            f"Failed to parse arguments as JSON. Arguments: {tool_call['function']['arguments']}"
+        ) from e
+    for param_name, param_value in args.items():
+        is_multiline = isinstance(param_value, str) and '\n' in param_value
+        ret += f'<parameter={param_name}>'
+        if is_multiline:
+            ret += '\n'
+        ret += f'{param_value}'
+        if is_multiline:
+            ret += '\n'
+        ret += '</parameter>\n'
+    ret += '</function>'
+    return ret
+
+
+def format_traj(traj, first_n_turns=None, last_n_turns=None) -> str:
+    output = ''
+    system_message = None
+
+    # Handle system message if present
+    if traj[0]['role'] == 'system':
+        system_message = traj[0]
+        traj = traj[1:]
+        content = _convert_content(system_message['content'])
+        output += "*** System Message that describes the assistant's behavior ***\n"
+        output += f'{content}\n'
+
+    # Merge consecutive user messages first
+    merged_traj = []
+    current_messages = []
+
+    n_turns = len(traj)
+    for i, message in enumerate(traj):
+        # Skip this message if...
+        if (
+            # Case 1: first_n_turns specified and we're past it
+            (first_n_turns is not None and i >= first_n_turns and last_n_turns is None)
+            or
+            # Case 2: last_n_turns specified and we're before it
+            (
+                last_n_turns is not None
+                and i < n_turns - last_n_turns
+                and first_n_turns is None
+            )
+            or
+            # Case 3: both specified and we're in the middle section
+            (
+                first_n_turns is not None
+                and last_n_turns is not None
+                and i >= first_n_turns
+                and i < n_turns - last_n_turns
+            )
+        ):
+            continue
+
+        if message['role'] == 'user':
+            current_messages.append(message)
+        else:
+            if current_messages:
+                # Merge all accumulated user messages into one
+                merged_content = '\n'.join(
+                    _convert_content(msg['content']) for msg in current_messages
+                )
+                merged_traj.append({'role': 'user', 'content': merged_content})
+                current_messages = []
+            merged_traj.append(message)
+
+    # Don't forget to handle any remaining user messages
+    if current_messages:
+        merged_content = '\n'.join(
+            _convert_content(msg['content']) for msg in current_messages
+        )
+        merged_traj.append({'role': 'user', 'content': merged_content})
+
+    # Now process the merged trajectory
+    for i, message in enumerate(merged_traj):
+        role, content = message['role'], message['content']
+        content = _convert_content(content) if isinstance(content, list) else content
+        turn_id = i // 2 + 1
+        output += '-' * 100 + '\n'
+        output += f'*** Turn {turn_id} - {role.upper() if role != "tool" else "TOOL EXECUTION RESULT"} ***\n'
+
+        if role == 'user':
+            output += f'{content}\n'
+        elif role == 'tool':
+            output += f'{content}\n'
+        elif role == 'assistant':
+            output += f'{content}\n'
+            if (
+                'tool_calls' in message
+                and message['tool_calls'] is not None
+                and len(message['tool_calls']) > 0
+            ):
+                for toolcall_id, tool_call in enumerate(message['tool_calls']):
+                    output += f'### Tool Call {toolcall_id}\n'
+                    output += f'{convert_tool_call_to_string(tool_call)}\n'
+        else:
+            raise ValueError(f'Unexpected role: {role}')
+
+    output += '-' * 100 + '\n'
+    return output
+
+
 def write_row_to_md_file(row, instance_id_to_test_result):
     if 'git_patch' in row:
         model_patch = row['git_patch']
     elif 'test_result' in row and 'git_patch' in row['test_result']:
         model_patch = row['test_result']['git_patch']
     else:
-        raise ValueError(f'Row {row} does not have a git_patch')
+        print(f'Row {row} does not have a git_patch')
+        return
 
-    if row['instance_id'] in instance_id_to_test_result:
-        report = instance_id_to_test_result[row['instance_id']].get('report', {})
-        resolved = report.get('resolved', False)
-        test_output = instance_id_to_test_result[row['instance_id']].get(
-            'test_output', None
-        )
-    elif 'report' in row and row['report'] is not None:
+    test_output = None
+    # Use result from output.jsonl FIRST if available.
+    if 'report' in row and row['report'] is not None:
         if not isinstance(row['report'], dict):
             resolved = None
             print(
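
To make the new serialization concrete, here is what convert_tool_call_to_string produces for a hypothetical tool call; the function name and arguments below are invented for illustration, and a multiline string parameter would additionally get newlines inside its <parameter> tag:

tool_call = {
    'id': 'call_0',  # hypothetical id
    'type': 'function',
    'function': {
        'name': 'str_replace_editor',
        'arguments': '{"command": "view", "path": "/workspace/app.py"}',
    },
}
print(convert_tool_call_to_string(tool_call))
# <function=str_replace_editor>
# <parameter=command>view</parameter>
# <parameter=path>/workspace/app.py</parameter>
# </function>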
@@ -80,6 +232,12 @@ def write_row_to_md_file(row, instance_id_to_test_result):
             )
         else:
             resolved = row['report'].get('resolved', False)
+    elif row['instance_id'] in instance_id_to_test_result:
+        report = instance_id_to_test_result[row['instance_id']].get('report', {})
+        resolved = report.get('resolved', False)
+        test_output = instance_id_to_test_result[row['instance_id']].get(
+            'test_output', None
+        )
     else:
         resolved = None
 
@@ -88,6 +246,8 @@ def write_row_to_md_file(row, instance_id_to_test_result):
     os.makedirs(output_md_folder, exist_ok=True)
     filepath = os.path.join(output_md_folder, filename)
 
+    completions = load_completions(instance_id)
+
     with open(filepath, 'w') as f:
         f.write(f'# {instance_id} (resolved: {resolved})\n')
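
load_completions returns None when no llm_completions/<instance_id>/*.json files exist, so the Completion section added in the next hunk is simply omitted for those instances; when a file is found, the result is a dict with 'messages' and 'tools'. A usage sketch (the instance id is illustrative):

completions = load_completions('astropy__astropy-12907')  # hypothetical instance id
if completions is not None:
    print(len(completions['messages']), 'messages,', len(completions['tools']), 'tools')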
@@ -97,7 +257,12 @@ def write_row_to_md_file(row, instance_id_to_test_result):
         f.write(json.dumps(row['metadata'], indent=2))
         f.write('\n```\n')
 
-        # Trajectory
+        # Completion
+        if completions is not None:
+            f.write('## Completion\n')
+            traj = completions['messages']
+            f.write(format_traj(traj))
+
         f.write('## History\n')
         f.write(convert_history_to_str(row['history']))

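format_traj is called here without first_n_turns / last_n_turns, so every turn is rendered; the two parameters act as a keep-window over the trajectory. A small mirror of the three skip conditions, to sanity-check the windowing in isolation (kept_indices is a hypothetical helper, not part of the diff):

def kept_indices(n_turns, first_n_turns=None, last_n_turns=None):
    # Same three skip cases as format_traj, applied to message indices 0..n_turns-1.
    kept = []
    for i in range(n_turns):
        skip = (
            (first_n_turns is not None and i >= first_n_turns and last_n_turns is None)
            or (last_n_turns is not None and i < n_turns - last_n_turns and first_n_turns is None)
            or (
                first_n_turns is not None
                and last_n_turns is not None
                and i >= first_n_turns
                and i < n_turns - last_n_turns
            )
        )
        if not skip:
            kept.append(i)
    return kept

assert kept_indices(10, first_n_turns=3) == [0, 1, 2]
assert kept_indices(10, last_n_turns=3) == [7, 8, 9]
assert kept_indices(10, first_n_turns=2, last_n_turns=2) == [0, 1, 8, 9]
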
@@ -207,12 +207,13 @@ with open(args.input_file, 'r') as infile:
     for line in tqdm(infile, desc='Checking for changes'):
         data = json.loads(line)
         instance_id = data['instance_id']
        if instance_id in instance_id_to_status:
-            current_report = data.get('report', {})
-            new_report = instance_id_to_status[instance_id]
+            current_report = data.get('report', {})
+            new_report = instance_id_to_status[
+                instance_id
+            ]  # if no report, it's not resolved
             if current_report != new_report:
                 needs_update = True
                 break
 
 if not needs_update:
     print('No updates detected. Skipping file update.')
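
The change detection relies on Python dict equality, which compares by content, so any difference between the stored report and the freshly computed one flips needs_update:

current_report = {'resolved': False}
new_report = {'resolved': True}
assert current_report != new_report  # content differs => file gets rewritten
assert {'resolved': True} == dict(resolved=True)  # equal content => no update needed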
@@ -234,6 +235,5 @@ with open(args.input_file + '.bak', 'r') as infile, open(
     for line in tqdm(infile, desc='Updating output file'):
         data = json.loads(line)
         instance_id = data['instance_id']
-        if instance_id in instance_id_to_status:
-            data['report'] = instance_id_to_status[instance_id]
+        data['report'] = instance_id_to_status[instance_id]
         outfile.write(json.dumps(data) + '\n')
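
Dropping the 'if instance_id in instance_id_to_status:' guard makes the assignment unconditional, which assumes every instance in the output file has an evaluation entry; a plain dict lookup raises KeyError otherwise. A sketch of the two behaviors (instance ids are illustrative):

instance_id_to_status = {'astropy__astropy-12907': {'resolved': True}}

data = {'instance_id': 'astropy__astropy-12907'}
data['report'] = instance_id_to_status[data['instance_id']]  # entry present: fine

missing = {'instance_id': 'django__django-11001'}
try:
    missing['report'] = instance_id_to_status[missing['instance_id']]
except KeyError:
    missing['report'] = {}  # .get(..., {}) would be the defensive alternative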
@@ -76,7 +76,7 @@ echo "Running SWE-bench evaluation"
 echo "=============================================================="
 
 RUN_ID=$(date +"%Y%m%d_%H%M%S")
-N_PROCESS=16
+N_PROCESS=4
 
 if [ -z "$INSTANCE_ID" ]; then
     echo "Running SWE-bench evaluation on the whole input file..."
@@ -87,7 +87,7 @@ if [ -z "$INSTANCE_ID" ]; then
         --dataset_name "$DATASET_NAME" \
         --split "$SPLIT" \
         --predictions_path $SWEBENCH_FORMAT_JSONL \
-        --timeout 1800 \
+        --timeout 3600 \
         --cache_level instance \
         --max_workers $N_PROCESS \
         --run_id $RUN_ID
@@ -133,7 +133,7 @@ else
         --dataset_name "$DATASET_NAME" \
         --split "$SPLIT" \
         --predictions_path $SWEBENCH_FORMAT_JSONL \
-        --timeout 1800 \
+        --timeout 3600 \
         --instance_ids $INSTANCE_ID \
         --cache_level instance \
         --max_workers $N_PROCESS \

@@ -1,7 +1,6 @@
 You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
 <IMPORTANT>
 * If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
-* You should start exploring the file system with your view command, unless you need to explore more deeply.
 * When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.
-* You MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
+* The assistant MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
 </IMPORTANT>