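"""Localization-accuracy (loc) evaluation for SWE-Bench agent trajectories.

Source: OpenHands/evaluation/benchmarks/swe_bench/loc_eval/loc_evaluator.py

Example invocation (a sketch; flag values are illustrative, not taken from this file):

    python -m evaluation.benchmarks.swe_bench.loc_eval.loc_evaluator \
        --infer-dir <inference run directory containing output.jsonl> \
        --dataset <SWE-Bench dataset name> \
        --split test \
        --max-infer-turn 100 \
        --align-with-max true
"""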
import argparse
import ast
import json
import os
import re
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from evaluation.benchmarks.swe_bench.loc_eval.loc_utils import LocMeta
from evaluation.benchmarks.swe_bench.run_infer import filter_dataset
from evaluation.utils.shared import prepare_dataset
from openhands.core.logger import openhands_logger as logger
class LocEvaluator:
def __init__(self, args):
"""
Localization evaluation.
Args:
            args: parsed command-line arguments (see the __main__ block)
"""
# Config
self.args = args
self.eval_dir = args.eval_dir
self.eval_task_success = self._check_if_to_eval_success()
self.sandbox_root = '/workspace'
self.agent_turn_num = -1
self.max_agent_turn = args.max_infer_turn
self.align_failed_with_max_iter = args.align_with_max
# Data
self.instance = None
self.trajectory = None
# Localization
self.localizer = LocMeta(args.dataset, args.split)
self.gold_loc = {'file': [], 'function': []}
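        # agent_loc bookkeeping (semantics inferred from how the fields are used below):
        #   'gold loc'     - ground-truth file paths / (file, function) pairs for this instance
        #   'agent loc'    - gold locations the agent has actually edited so far
        #   'turn index'   - per gold location, the first agent turn that edited it
        #   'loc progress' - per gold location, whether it has been edited (bool)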
self.agent_loc = {
'gold loc': {'file': [], 'function': []},
'agent loc': {'file': [], 'function': []},
'turn index': {'file': [], 'function': []},
'loc progress': {'file': [], 'function': []},
}
# Task success tracking
self.task_resolved = False
# Cost
self.cost_summary = {'total_cost': 0.0, 'avg_cost': 0.0, 'details': {}}
# Save
self.save_dir = os.path.join(args.save_dir, 'loc_eval_results')
self._init_dir(self.save_dir)
self.all_eval_results = {}
self.overall_eval = {}
def _init_config(self):
# Data
self.instance = None
self.gold_loc = {'file': [], 'function': []}
self.trajectory = None
self.agent_turn_num = -1
# Localization
self.agent_loc = {
'gold loc': {'file': [], 'function': []},
'agent loc': {'file': [], 'function': []},
'turn index': {'file': [], 'function': []},
'loc progress': {'file': [], 'function': []},
}
# Task success tracking
self.task_resolved = False
def _init_dir(self, directory_path):
"""
Check if a directory exists and create it if it doesn't.
Args:
directory_path (str): Path to the directory to check/create
Returns:
bool: True if directory already existed, False if it was created
"""
if os.path.exists(directory_path):
if not os.path.isdir(directory_path):
raise NotADirectoryError(
f'Path exists but is not a directory: {directory_path}'
)
return True
else:
os.makedirs(directory_path)
return False
    def _check_if_to_eval_success(self):
        """Check if post-evaluation outputs exist"""
        return self.eval_dir is not None and os.path.isdir(self.eval_dir)
def _compute_avg_over_all(self):
"""Compute average loc evaluations over all instances"""
macro_la_file, micro_la_file = 0, 0
macro_la_func, micro_la_func = 0, 0
resolve_rate = 0
macro_avg_file_idx, macro_avg_func_idx = 0, 0
micro_avg_file_idx, micro_avg_func_idx = 0, 0
avg_resolve_idx = 0
total_instance_num = len(self.all_eval_results)
for instance_id in self.all_eval_results:
curr_eval_result = self.all_eval_results[instance_id]['final_eval']
# File
macro_la_file += curr_eval_result['localization']['loc_acc (%)'][
'la_file (%)'
]['la_file_macro']
micro_la_file += curr_eval_result['localization']['loc_acc (%)'][
'la_file (%)'
]['la_file_micro']
macro_avg_file_idx += curr_eval_result['localization']['turn_idx']['file'][
'macro'
]
micro_avg_file_idx += curr_eval_result['localization']['turn_idx']['file'][
'micro'
]
# Function
macro_la_func += curr_eval_result['localization']['loc_acc (%)'][
'la_func (%)'
]['la_func_macro']
micro_la_func += curr_eval_result['localization']['loc_acc (%)'][
'la_func (%)'
]['la_func_micro']
macro_avg_func_idx += curr_eval_result['localization']['turn_idx'][
'function'
]['macro']
micro_avg_func_idx += curr_eval_result['localization']['turn_idx'][
'function'
]['micro']
if self.eval_task_success:
if curr_eval_result['task_success']['resolved']:
resolve_rate += 1
avg_resolve_idx += curr_eval_result['task_success']['resolve_index']
else:
avg_resolve_idx += self.max_agent_turn
# Average
macro_la_file = macro_la_file / total_instance_num
micro_la_file = micro_la_file / total_instance_num
macro_la_func = macro_la_func / total_instance_num
micro_la_func = micro_la_func / total_instance_num
macro_avg_file_idx = macro_avg_file_idx / total_instance_num
micro_avg_file_idx = micro_avg_file_idx / total_instance_num
macro_avg_func_idx = macro_avg_func_idx / total_instance_num
micro_avg_func_idx = micro_avg_func_idx / total_instance_num
if self.eval_task_success:
resolve_rate = resolve_rate / total_instance_num * 100
avg_resolve_idx = avg_resolve_idx / total_instance_num
# Cost metric
total_cost, avg_cost = 0.0, 0.0
for instance_key in self.cost_summary['details']:
total_cost += self.cost_summary['details'][instance_key]
avg_cost = total_cost / len(self.cost_summary['details'])
self.cost_summary['total_cost'] = total_cost
self.cost_summary['avg_cost'] = avg_cost
self.overall_eval = {
'la_file (%)': {'macro': macro_la_file, 'micro': micro_la_file},
'la_func (%)': {'macro': macro_la_func, 'micro': micro_la_func},
'resolve_rate (%)': resolve_rate if self.eval_task_success else None,
'loc_file_idx (turn idx)': {
'macro': macro_avg_file_idx,
'micro': micro_avg_file_idx,
},
'loc_func_idx (turn idx)': {
'macro': macro_avg_func_idx,
'micro': micro_avg_func_idx,
},
'resolve_idx (turn idx)': avg_resolve_idx
if self.eval_task_success
else None,
'max_turn_limit': self.max_agent_turn,
'total_instance_num': total_instance_num,
'cost_summary': self.cost_summary,
}
self._write_to_json(self.overall_eval, 'overall_eval.json')
def _save_to_eval_dicts(self, agent_trajectory: dict):
        # Current instance
self._write_to_json(
agent_trajectory, f'loc__instance_{self.instance.instance_id}.json'
)
# All instances
self.all_eval_results[self.instance.instance_id] = agent_trajectory
self._write_to_json(self.all_eval_results, 'all_loc_evals.json')
# Overall scores
self._compute_avg_over_all()
def _write_to_json(self, data, file_name):
"""
        Writes the given data to a JSON file under <save_dir>/loc_acc/.
Returns:
bool: True if writing was successful, False otherwise.
"""
try:
output_dir = os.path.join(self.save_dir, 'loc_acc')
os.makedirs(output_dir, exist_ok=True)
filepath = os.path.join(output_dir, file_name)
with open(filepath, 'w') as f:
json.dump(data, f, indent=4)
return True
except Exception as e:
logger.error(f'Error writing to JSON: {str(e)}')
return False
def read_from_json(self, file_path):
"""
        Reads data from a JSON file.
Returns:
dict: The loaded JSON data, or an empty dict if the file doesn't exist
or an error occurs.
"""
try:
with open(file_path, 'r') as file:
data = json.load(file)
return data
except FileNotFoundError:
logger.warning(
f"Warning: File '{file_path}' not found. Returning an empty dictionary..."
)
return {}
except json.JSONDecodeError:
logger.error(
f"Error: File '{file_path}' contains invalid JSON. Returning an empty dictionary..."
)
return {}
except Exception as e:
logger.error(
f'Error reading from JSON: {str(e)}\nReturning an empty dictionary...'
)
return {}
    def read_from_jsonl(self, file_path):
        """
        Reads records from a JSONL file (one JSON object per line).
        Returns:
            list: The loaded records, or an empty list if the file doesn't exist
            or an error occurs.
        """
        records = []
        try:
            with open(file_path, 'r') as file:
                for line_num, line in enumerate(file, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        records.append(json.loads(line))
                    except json.JSONDecodeError:
                        logger.error(
                            f"Error: Line {line_num} of '{file_path}' contains invalid JSON. Skipping it..."
                        )
            return records
        except FileNotFoundError:
            logger.warning(
                f"Warning: File '{file_path}' not found. Returning an empty list..."
            )
            return []
        except Exception as e:
            logger.error(
                f'Error reading from JSONL: {str(e)}\nReturning an empty list...'
            )
            return []
    def _parse_agent_turn_num(self):
        """Count the agent's action turns in the current instance's trajectory"""
history_idx = 1
self.agent_turn_num = 0
while history_idx < len(self.trajectory) - 1:
if (
(self.trajectory[history_idx]['source'] == 'agent')
and ('action' in self.trajectory[history_idx].keys())
and (self.trajectory[history_idx]['action'] != 'system')
):
self.agent_turn_num += 1
history_idx += 1
def _parse_string_to_dict(self, dict_string) -> dict:
"""
Convert a string representation of a dictionary to an actual dictionary.
Args:
dict_string (str): String representation of a dictionary
Returns:
dict or None: The parsed dictionary if successful, None if failed
"""
if not isinstance(dict_string, str):
return None
dict_string = dict_string.strip()
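        # Illustrative behavior (hypothetical inputs, not from the original file):
        #   '{"command": "create", "path": "/workspace/a.py"}'  -> dict via json.loads
        #   "{'command': 'str_replace', 'path': '/workspace/a.py'}" -> dict via ast.literal_eval
        #   'not a dict at all' -> None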
# (1) Try JSON parsing
try:
return json.loads(dict_string)
except (json.JSONDecodeError, ValueError):
pass
        # (2) Fall back to Python-literal parsing via ast
try:
result = ast.literal_eval(dict_string)
if isinstance(result, dict):
return result
else:
return None
except (ValueError, SyntaxError):
pass
# If both methods fail, return None
return None
def _parse_value_from_args(self, argument_str: str, key: str) -> str:
"""
Parse a specific key's value from argument string.
Args:
argument_str (str): The argument string containing key-value pairs
key (str): The key to extract (e.g., "path", "new_str", "old_str")
Returns:
str: The extracted value, or empty string if not found
"""
if not isinstance(argument_str, str) or not isinstance(key, str):
return ''
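        # Illustrative example (hypothetical argument string):
        #   _parse_value_from_args('{"path": "/workspace/a.py", "new_str": "..."}', 'path')
        #   -> '/workspace/a.py'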
try:
            json_pattern = rf'"{re.escape(key)}"\s*:\s*"((?:[^"\\]|\\.)*)"'
match = re.search(json_pattern, argument_str, re.DOTALL)
if match:
value = match.group(1)
value = (
value.replace('\\"', '"')
.replace('\\n', '\n')
.replace('\\t', '\t')
.replace('\\\\', '\\')
)
return value
python_pattern = rf"'{re.escape(key)}'\s*:\s*'((?:[^'\\]|\\.)*)'"
match = re.search(python_pattern, argument_str, re.DOTALL)
if match:
value = match.group(1)
value = (
value.replace("\\'", "'")
.replace('\\n', '\n')
.replace('\\t', '\t')
.replace('\\\\', '\\')
)
return value
if key in argument_str:
parts = argument_str.split(f'"{key}"', 1)
if len(parts) == 1:
parts = argument_str.split(f"'{key}'", 1)
if len(parts) > 1:
remainder = parts[1].strip()
for quote_char in ['"', "'"]:
pattern = f'\\s*:\\s*{quote_char}((?:[^{quote_char}\\\\]|\\\\.)*)(?:{quote_char}|$)'
match = re.search(pattern, remainder, re.DOTALL)
if match:
value = match.group(1)
if quote_char == '"':
value = (
value.replace('\\"', '"')
.replace('\\n', '\n')
.replace('\\t', '\t')
.replace('\\\\', '\\')
)
else:
value = (
value.replace("\\'", "'")
.replace('\\n', '\n')
.replace('\\t', '\t')
.replace('\\\\', '\\')
)
return value
if key == 'path':
path_pattern = r'/[^\s,}"\']*'
match = re.search(path_pattern, remainder)
if match:
return match.group(0)
return ''
except Exception:
return ''
def _parse_path_from_args(self, argument_str: str) -> str:
"""
Parse path from argument string.
Args:
argument_str (str): The argument string containing path information
Returns:
str: The extracted file path, or empty string if not found
"""
return self._parse_value_from_args(argument_str, 'path')
def _parse_func_names_from_str(self, code_patch) -> list:
"""
Parse function names from the new_str code patch.
Args:
code_patch: Either a string (argument string) or already extracted new_str code
Returns:
list: List of function names found in the code patch
"""
if not code_patch:
return []
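        # Illustrative example (hypothetical patch text):
        #   _parse_func_names_from_str('def foo(x):\n    pass\ndef __len__(self):\n    ...')
        #   -> ['foo', '__len__']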
try:
# Look for "def function_name(" patterns
# This pattern matches:
# - "def" followed by whitespace
# - function name (letters, numbers, underscores, also handle special methods like __len__)
# - opening parenthesis
func_pattern = r'\bdef\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\('
matches = re.findall(func_pattern, code_patch)
# Remove duplicates while preserving order
seen = set()
unique_funcs = []
for func_name in matches:
if func_name not in seen:
seen.add(func_name)
unique_funcs.append(func_name)
return unique_funcs
except Exception:
return []
def _parse_loc_from_history(self, action_history: dict) -> list:
"""Parse function name and file path"""
        if not action_history:
            logger.error('No action history provided.')
            raise ValueError('No action history provided.')
curr_turn_agent_loc = {}
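        # curr_turn_agent_loc maps each edited file path to the function names touched
        # in this turn, e.g. (hypothetical): {'/workspace/pkg/mod.py': ['foo', 'bar']}.
        # Non-edit actions yield an empty dict.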
if action_history['action'] != 'edit':
return curr_turn_agent_loc
agent_msg_list = action_history['tool_call_metadata']['model_response'][
'choices'
]
agent_edit = {
'create': ['file_text'],
'str_replace': ['old_str', 'new_str'],
}
for cho in agent_msg_list:
for func_dict in cho['message']['tool_calls']:
edit_args = func_dict['function']['arguments']
edit_dict = self._parse_string_to_dict(edit_args)
if edit_dict:
curr_command = edit_dict['command']
agent_acts = agent_edit[curr_command]
file_path = edit_dict.get('path', None)
func_names = []
for act in agent_acts:
code_patch = edit_dict.get(act, None)
func_names.extend(self._parse_func_names_from_str(code_patch))
func_names = list(set(func_names))
                else:
                    # Arguments were not parseable as a dict; fall back to string matching.
                    agent_acts = []
                    for new_act in agent_edit.values():
                        if any(act_key in edit_args for act_key in new_act):
                            agent_acts = new_act
                            break
                    file_path = self._parse_path_from_args(edit_args)
                    func_names = []
                    for act in agent_acts:
                        code_patch = edit_args.split(act)[-1].strip()
                        func_names.extend(self._parse_func_names_from_str(code_patch))
                    func_names = list(set(func_names))
if file_path and len(file_path) > 0:
if func_names:
if file_path in curr_turn_agent_loc:
curr_turn_agent_loc[file_path].extend(func_names)
else:
curr_turn_agent_loc[file_path] = func_names
else:
curr_turn_agent_loc[file_path] = []
return curr_turn_agent_loc
    def _add_task_success_metric(self) -> dict:
"""Task success evaluation result"""
self.task_resolved = False
report_pth = os.path.join(
self.eval_dir, self.instance.instance_id, 'report.json'
)
eval_report = self.read_from_json(report_pth)
if self.instance.instance_id in eval_report.keys():
self.task_resolved = eval_report[self.instance.instance_id]['resolved']
if self.task_resolved:
return {
'resolved': self.task_resolved,
'resolve_index': self.agent_turn_num,
}
if self.align_failed_with_max_iter:
return {
'resolved': self.task_resolved,
'resolve_index': self.max_agent_turn,
}
else:
return {
'resolved': self.task_resolved,
'resolve_index': self.agent_turn_num,
}
def eval_agent_trajectory(self):
"""Evaluate agent's localization at current state"""
if not self.trajectory:
logger.warning(
f'Inference trajectory for current instance (instance ID: {self.instance.instance_id}) is None, skipping localization evaluation for current instance...'
)
return
# Process history
agent_trajectory = {'final_eval': {}, 'trajectory': {}}
turn_idx = 0
history_idx = 1
while history_idx < len(self.trajectory) - 2:
history_idx += 1
action_history = self.trajectory[history_idx]
observ_history = self.trajectory[history_idx + 1]
# Pass non-agent histories
if (action_history['source'] != 'agent') or (
'action' not in action_history.keys()
):
continue
# Parse action
turn_idx += 1
curr_turn_agent_loc = self._parse_loc_from_history(action_history)
agent_trajectory['trajectory'][f'turn {turn_idx}'] = {
'loc_eval': None,
'loc': curr_turn_agent_loc,
'action': {
'action': action_history['action'],
'message': action_history['message'],
},
'observation': None,
}
if 'observation' in observ_history.keys():
agent_trajectory['trajectory'][f'turn {turn_idx}']['observation'] = {
'observation': observ_history['observation'],
'message': observ_history['message'],
}
# Loc eval
if len(curr_turn_agent_loc) > 0:
for file_key in curr_turn_agent_loc:
for func_name in curr_turn_agent_loc[file_key]:
# File loc
if file_key in self.gold_loc['file']:
if file_key not in self.agent_loc['agent loc']['file']:
self.agent_loc['agent loc']['file'].append(file_key)
self.agent_loc['turn index']['file'][
self.gold_loc['file'].index(file_key)
] = turn_idx
self.agent_loc['loc progress']['file'][
self.gold_loc['file'].index(file_key)
] = True
# Function loc
new_agent_loc = {'file': file_key, 'function': func_name}
if new_agent_loc in self.gold_loc['function']:
if (
new_agent_loc
not in self.agent_loc['agent loc']['function']
):
self.agent_loc['agent loc']['function'].append(
new_agent_loc
)
self.agent_loc['turn index']['function'][
self.gold_loc['function'].index(new_agent_loc)
] = turn_idx
self.agent_loc['loc progress']['function'][
self.gold_loc['function'].index(new_agent_loc)
] = True
agent_trajectory['trajectory'][f'turn {turn_idx}']['loc_eval'] = (
self.agent_loc
)
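        # Metric semantics (inferred from the computations below, not from upstream docs):
        #   la_*_micro : percentage of gold files / functions the agent edited
        #   la_*_macro : 100.0 if at least one gold file / function was edited, else 0.0
        #   turn_idx   : agent turn at which gold locations were first edited
        #                (micro = latest such turn, macro = earliest such turn)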
        # Final per-instance evaluation (localization + task success)
agent_trajectory['final_eval'] = {
'total turn': self.agent_turn_num,
'max turn': self.max_agent_turn,
'localization': {
'loc_acc (%)': {
'la_file (%)': {
'la_file_micro': sum(self.agent_loc['loc progress']['file'])
/ len(self.agent_loc['loc progress']['file'])
* 100,
'la_file_macro': 100.0
if sum(self.agent_loc['loc progress']['file']) > 0
else 0.0,
},
'la_func (%)': {
'la_func_micro': sum(self.agent_loc['loc progress']['function'])
/ len(self.agent_loc['loc progress']['function'])
* 100,
'la_func_macro': 100.0
if sum(self.agent_loc['loc progress']['function']) > 0
else 0.0,
},
},
'turn_idx': {
'file': {
'micro': max(self.agent_loc['turn index']['file']),
'macro': min(self.agent_loc['turn index']['file']),
},
'function': {
'micro': max(self.agent_loc['turn index']['function']),
'macro': min(self.agent_loc['turn index']['function']),
},
},
'details': {
'loc_file': self.agent_loc['loc progress']['file'],
'loc_func': self.agent_loc['loc progress']['function'],
},
},
'task_success': None,
}
# Task success
if self.eval_task_success:
agent_trajectory['final_eval']['task_success'] = (
self._add_task_success_metric()
)
# Align loc with success
if self.task_resolved:
if agent_trajectory['final_eval']['localization']['loc_acc (%)'] != {
'la_file (%)': {'la_file_micro': 100.0, 'la_file_macro': 100.0},
'la_func (%)': {'la_func_micro': 100.0, 'la_func_macro': 100.0},
}:
agent_trajectory['final_eval']['localization']['loc_acc (%)'] = {
'la_file (%)': {'la_file_micro': 100.0, 'la_file_macro': 100.0},
'la_func (%)': {'la_func_micro': 100.0, 'la_func_macro': 100.0},
}
agent_trajectory['final_eval']['localization']['details'] = {
'loc_file': [
True for i in range(len(self.agent_loc['loc progress']['file']))
],
'loc_func': [
True
for i in range(len(self.agent_loc['loc progress']['function']))
],
}
if self.align_failed_with_max_iter:
for level1 in agent_trajectory['final_eval']['localization'][
'turn_idx'
]:
for level2 in agent_trajectory['final_eval']['localization'][
'turn_idx'
][level1]:
if (
agent_trajectory['final_eval']['localization']['turn_idx'][
level1
][level2]
> self.agent_turn_num
):
agent_trajectory['final_eval']['localization']['turn_idx'][
level1
][level2] = self.agent_turn_num
# Save
self._save_to_eval_dicts(agent_trajectory)
def _get_instance_gt_loc(self):
"""Get ground-truth localization for current instance"""
gt_localization = self.localizer.parse_instance_loc(self.instance)
# Convert to dict
gt_loc_dict = gt_localization['patch'].to_dict()
assert gt_loc_dict['instance_id'] == self.instance.instance_id
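        # Resulting self.gold_loc shape (paths are hypothetical):
        #   {'gt_loc_dict': {...},
        #    'file': ['/workspace/pkg/mod.py'],
        #    'function': [{'file': '/workspace/pkg/mod.py', 'function': 'foo'}]}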
self.gold_loc = {
'gt_loc_dict': gt_loc_dict['functions'],
'file': [],
'function': [],
}
for file_key in gt_loc_dict['functions']:
if len(gt_loc_dict['functions'][file_key]) == 0:
continue
# File
if file_key not in self.gold_loc['file']:
self.gold_loc['file'].append(f'{self.sandbox_root}/{file_key}')
# Function
for func_name in gt_loc_dict['functions'][file_key]:
new_gt = {
'file': f'{self.sandbox_root}/{file_key}',
'function': func_name,
}
self.gold_loc['function'].append(new_gt)
# Init agent loc accordingly
init_turn = (
self.max_agent_turn
if self.align_failed_with_max_iter
else self.agent_turn_num
)
self.agent_loc['gold loc'] = {
'file': self.gold_loc['file'],
'function': self.gold_loc['function'],
}
self.agent_loc['turn index']['file'] = [
init_turn for i in range(len(self.gold_loc['file']))
]
self.agent_loc['turn index']['function'] = [
init_turn for i in range(len(self.gold_loc['function']))
]
self.agent_loc['loc progress']['file'] = [
False for i in range(len(self.gold_loc['file']))
]
self.agent_loc['loc progress']['function'] = [
False for i in range(len(self.gold_loc['function']))
]
def instance_loc_eval(
self,
instance: pd.Series = None,
repo_root: str = None,
trajectory: list = None,
infer_cost: dict = None,
):
        if instance is None:
            logger.error(
                'No instance provided. Skipping current localization evaluation...'
            )
            return
if trajectory is None:
logger.error(
f'No inference trajectory provided for current instance with ID: {instance.instance_id}'
)
if infer_cost is None:
logger.error(
f'No inference accumulated cost for current instance with ID: {instance.instance_id}'
)
# Init
self._init_config()
self.cost_summary['details'][instance.instance_id] = infer_cost
# Update current instance
self.instance = instance
self.trajectory = trajectory
self.sandbox_root = repo_root
# Max turn
self._parse_agent_turn_num()
# GT loc
self._get_instance_gt_loc()
# Loc evaluation
self.eval_agent_trajectory()
def swe_data_loader(args):
"""
    Load SWE-Bench data.
Args:
args: Main arguments.
"""
dataset = load_dataset(args.dataset, split=args.split)
swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
logger.info(
f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
)
if 'SWE-Gym' in args.dataset:
with open(
os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'split',
'swegym_verified_instances.json',
),
'r',
) as f:
swegym_verified_instances = json.load(f)
swe_bench_tests = swe_bench_tests[
swe_bench_tests['instance_id'].isin(swegym_verified_instances)
]
logger.info(
f'{len(swe_bench_tests)} tasks left after filtering for SWE-Gym verified instances'
)
instances = prepare_dataset(swe_bench_tests, args.swe_output_file, -1)
return instances
def infer_data_loader(args):
    """
    Load inference outputs.
    Args:
        args: Main arguments.
    Returns:
        list: Parsed JSON records from output.jsonl under args.infer_dir,
        one dict per line.
    Raises:
        FileNotFoundError: If output.jsonl doesn't exist in args.infer_dir.
    """
infer_output_filepath = os.path.join(args.infer_dir, 'output.jsonl')
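    # Each record in output.jsonl is expected to carry at least (per the main loop below):
    #   instance_id, instruction (with an <uploaded_files>...</uploaded_files> span),
    #   history (the agent trajectory), and metrics['accumulated_cost'].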
infer_outputs = []
with open(infer_output_filepath, 'r') as file:
for line_num, line in enumerate(file, 1):
line = line.strip()
if line:
try:
json_obj = json.loads(line)
infer_outputs.append(json_obj)
except json.JSONDecodeError as e:
logger.error(
f"Error parsing JSON on line {line_num} in '{infer_output_filepath}': {str(e)}"
)
continue
return infer_outputs
def infer_cost_calculator(args):
"""
Calculate total and average costs from metric JSON files with detailed output.
Args:
args: Main arguments.
Returns:
dict: A dictionary containing:
- 'total_cost': Sum of all accumulated costs
- 'average_cost': Average cost per JSON file
- 'file_count': Number of JSON files processed
- 'individual_costs': List of individual costs (optional)
"""
metrics_dir = os.path.join(args.infer_dir, 'metrics')
if not os.path.exists(metrics_dir):
raise FileNotFoundError(f'Metrics directory not found: {metrics_dir}')
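    # Each metrics/*.json file is expected to contain at least {"accumulated_cost": <float>};
    # other keys, if present, are ignored here.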
individual_costs = []
for filename in os.listdir(metrics_dir):
if filename.endswith('.json'):
file_path = os.path.join(metrics_dir, filename)
try:
with open(file_path, 'r', encoding='utf-8') as file:
metric_data = json.load(file)
if 'accumulated_cost' not in metric_data:
raise KeyError(f"'accumulated_cost' not found in {filename}")
cost = float(metric_data['accumulated_cost'])
individual_costs.append(cost)
except (json.JSONDecodeError, ValueError, TypeError, IOError) as e:
logger.warning(f'Warning: Error processing {filename}: {e}')
continue
if not individual_costs:
raise ValueError('No valid JSON files found in the metrics directory')
total_cost = sum(individual_costs)
average_cost = total_cost / len(individual_costs)
return {
'total_cost': total_cost,
'average_cost': average_cost,
'file_count': len(individual_costs),
'individual_costs': individual_costs,
}
if __name__ == '__main__':
    # Localization evaluation entry point.
parser = argparse.ArgumentParser(
description='Localization evaluation on SWE-Bench.'
)
parser.add_argument(
'--infer-dir',
type=str,
default=None,
help='Directory containing model inference outputs',
)
parser.add_argument(
'--dataset', type=str, default=None, help='SWE-Bench dataset version'
)
parser.add_argument(
'--split', type=str, default=None, help='SWE-Bench dataset split selection'
)
parser.add_argument(
'--max-infer-turn',
type=int,
default=None,
help='Max number of turns allowed for coding agent.',
)
parser.add_argument(
'--align-with-max',
type=str,
choices=['true', 'false'],
default='true',
help='Whether to align failed instances with max iteration count (true/false)',
)
args = parser.parse_args()
# Convert args.align_with_max str to bool
args.align_with_max = args.align_with_max.lower() == 'true'
# Eval infer and loc
args.save_dir = f'{args.infer_dir}/loc_eval'
os.makedirs(args.save_dir, exist_ok=True)
args.eval_dir = f'{args.infer_dir}/eval_outputs'
if not os.path.isdir(args.eval_dir):
args.eval_dir = None
# SWE-Bench
args.swe_output_file = os.path.join(args.save_dir, 'swe_dataset.json')
# Load swebench data
swe_instances = swe_data_loader(args)
# Load inference data
infer_outputs = infer_data_loader(args)
# Loc eval
processed_instances = []
loc_eval_results = {}
loc_evaluator = LocEvaluator(args)
for infer_idx, infer_instance in tqdm(
enumerate(infer_outputs), total=len(infer_outputs), desc='Processing instances'
):
instance_id = infer_instance['instance_id']
swe_instance = swe_instances.query(f"instance_id == '{instance_id}'").iloc[0]
assert instance_id == swe_instance.instance_id
processed_instances.append(instance_id)
upload_instruction = infer_instance['instruction']
repo_root = (
upload_instruction.split('<uploaded_files>')[1]
.split('</uploaded_files>')[0]
.strip()
)
curr_trajectory = infer_instance['history']
curr_cost = infer_instance['metrics']['accumulated_cost']
loc_evaluator.instance_loc_eval(
swe_instance, repo_root, curr_trajectory, curr_cost
)
logger.info(
f'\n[Inference Data Summary]'
f'\n{" " * 4} - Total cost: $ {loc_evaluator.cost_summary["total_cost"]}'
f'\n{" " * 4} - Average cost: $ {loc_evaluator.cost_summary["avg_cost"]}'
f'\n{" " * 4} - Number of Instances: {len(processed_instances)}'
)