Mirror of https://github.com/All-Hands-AI/OpenHands.git, synced 2026-01-10 07:18:10 -05:00
chore(lint): Apply comprehensive linting and formatting fixes (#10287)
Co-authored-by: openhands <openhands@all-hands.dev>
@@ -506,7 +506,6 @@ def commit0_setup(dataset: pd.DataFrame, repo_split: str) -> pd.DataFrame:
     Returns:
         Filtered dataset based on split type
     """
-
     filtered_dataset = pd.concat(
         [
             dataset[dataset['repo'].str.split('/').str[1] == repo]
@@ -89,8 +89,7 @@ def get_config(
|
||||
def get_dv_query_for_real(
|
||||
datasets, question, domain_knowledge=None, workflow_tags=None
|
||||
):
|
||||
"""
|
||||
Prepare a structured query for the agent to execute on the specified datasets.
|
||||
"""Prepare a structured query for the agent to execute on the specified datasets.
|
||||
|
||||
This function constructs a query by compiling metadata from the provided datasets, along with any relevant domain knowledge and workflow tags.
|
||||
|
||||
@@ -104,7 +103,6 @@ def get_dv_query_for_real(
|
||||
query_to_dv: Query to be run on the dataset
|
||||
dataset_meta: Metadata of the dataset
|
||||
"""
|
||||
|
||||
dataset_meta = ''
|
||||
for dataset_metadata in datasets:
|
||||
dataset_meta += 'Dataset name: ' + dataset_metadata['name']
|
||||
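For orientation, a minimal, hedged sketch of the query construction this docstring describes; only the metadata loop and the 'name' field appear in the hunk above, so the helper name and the remaining fields are assumptions, not the benchmark's actual code.

    def build_dv_query_sketch(datasets, question, domain_knowledge=None, workflow_tags=None):
        # Compile per-dataset metadata into one string; the diff only shows
        # the 'name' field being appended.
        dataset_meta = ''
        for dataset_metadata in datasets:
            dataset_meta += 'Dataset name: ' + dataset_metadata['name'] + '\n'
        # Fold in the question plus any optional context.
        query_to_dv = dataset_meta + '\nQuestion: ' + question
        if domain_knowledge:
            query_to_dv += '\nDomain knowledge: ' + str(domain_knowledge)
        if workflow_tags:
            query_to_dv += '\nWorkflow tags: ' + str(workflow_tags)
        return query_to_dv, dataset_meta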
@@ -140,8 +138,7 @@ def get_dv_query_for_real(
|
||||
|
||||
|
||||
def initialize_runtime(runtime: Runtime, data_files: list[str]):
|
||||
"""
|
||||
Initialize the runtime for the agent.
|
||||
"""Initialize the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
@@ -231,8 +228,7 @@ def process_instance(
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
"""
|
||||
Process and evaluate a single instance of the dataset.
|
||||
"""Process and evaluate a single instance of the dataset.
|
||||
|
||||
This function executes the OpenHands agent
|
||||
for a specific instance of the dataset. It retrieves
|
||||
@@ -247,7 +243,6 @@ def process_instance(
|
||||
Returns:
|
||||
output: EvalOutput object
|
||||
"""
|
||||
|
||||
config = get_config(metadata)
|
||||
|
||||
# Setup the logger properly, so you can run
|
||||
@@ -356,8 +351,7 @@ def list_csv_files(list_of_datasets):
|
||||
|
||||
|
||||
def create_dataset(repo_location: str, split: str = 'test'):
|
||||
"""
|
||||
Create a dataset from the discoverybench repository
|
||||
"""Create a dataset from the discoverybench repository
|
||||
by walking through the repository and extracting metadata
|
||||
from the metadata_{}.json files
|
||||
|
||||
@@ -368,7 +362,6 @@ def create_dataset(repo_location: str, split: str = 'test'):
|
||||
Returns:
|
||||
df: DataFrame containing the dataset instances
|
||||
"""
|
||||
|
||||
data_dict = {}
|
||||
|
||||
data_location = os.path.join(repo_location, 'discoverybench', 'real', split)
|
||||
|
||||
@@ -105,8 +105,7 @@ def process_instance(
|
||||
log_dir: str | None = None,
|
||||
runtime_failure_count: int = 0,
|
||||
) -> EvalOutput:
|
||||
"""
|
||||
Evaluate agent performance on a SWE-bench problem instance.
|
||||
"""Evaluate agent performance on a SWE-bench problem instance.
|
||||
|
||||
Note that this signature differs from the expected input to `run_evaluation`. Use
|
||||
`functools.partial` to provide optional arguments before passing to the evaluation harness.
|
||||
|
||||
@@ -1,11 +1,8 @@
|
||||
"""
|
||||
Utilities for handling binary files and patch generation in SWE-bench evaluation.
|
||||
"""
|
||||
"""Utilities for handling binary files and patch generation in SWE-bench evaluation."""
|
||||
|
||||
|
||||
def remove_binary_diffs(patch_text):
|
||||
"""
|
||||
Remove binary file diffs from a git patch.
|
||||
"""Remove binary file diffs from a git patch.
|
||||
|
||||
Args:
|
||||
patch_text (str): The git patch text
|
||||
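As a hedged illustration (the filtering logic itself is not shown in this hunk), removing binary diffs from a patch typically means splitting it at 'diff --git' boundaries and dropping chunks that declare binary content:

    def remove_binary_diffs_sketch(patch_text: str) -> str:
        chunks, preamble = [], []
        for line in patch_text.splitlines(keepends=True):
            if line.startswith('diff --git'):
                chunks.append([])
            # Lines before the first file header go to the preamble.
            (chunks[-1] if chunks else preamble).append(line)
        kept = [chunk for chunk in chunks
                if not any('GIT binary patch' in ln or 'Binary files' in ln for ln in chunk)]
        return ''.join(preamble + [ln for chunk in kept for ln in chunk])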
@@ -36,8 +33,7 @@ def remove_binary_diffs(patch_text):
|
||||
|
||||
|
||||
def remove_binary_files_from_git():
|
||||
"""
|
||||
Generate a bash command to remove binary files from git staging.
|
||||
"""Generate a bash command to remove binary files from git staging.
|
||||
|
||||
Returns:
|
||||
str: A bash command that removes binary files from git staging
|
||||
|
||||
@@ -111,8 +111,7 @@ def process_instance(
|
||||
runtime_failure_count: int = 0,
|
||||
conditional_imports: ConditionalImports | None = None,
|
||||
) -> EvalOutput:
|
||||
"""
|
||||
Evaluate agent performance on a SWE-bench problem instance.
|
||||
"""Evaluate agent performance on a SWE-bench problem instance.
|
||||
|
||||
Note that this signature differs from the expected input to `run_evaluation`. Use
|
||||
`functools.partial` to provide optional arguments before passing to the evaluation harness.
|
||||
|
||||
@@ -16,8 +16,7 @@ from openhands.core.logger import openhands_logger as logger
|
||||
|
||||
class LocEvaluator:
|
||||
def __init__(self, args):
|
||||
"""
|
||||
Localization evaluation.
|
||||
"""Localization evaluation.
|
||||
|
||||
Args:
|
||||
args: all main arguments
|
||||
@@ -76,8 +75,7 @@ class LocEvaluator:
|
||||
self.task_resolved = False
|
||||
|
||||
def _init_dir(self, directory_path):
|
||||
"""
|
||||
Check if a directory exists and create it if it doesn't.
|
||||
"""Check if a directory exists and create it if it doesn't.
|
||||
|
||||
Args:
|
||||
directory_path (str): Path to the directory to check/create
|
||||
@@ -207,8 +205,7 @@ class LocEvaluator:
|
||||
self._compute_avg_over_all()
|
||||
|
||||
def _write_to_json(self, data, file_name):
|
||||
"""
|
||||
Writes the current object data to a JSON file.
|
||||
"""Writes the current object data to a JSON file.
|
||||
|
||||
Returns:
|
||||
bool: True if writing was successful, False otherwise.
|
||||
@@ -225,8 +222,7 @@ class LocEvaluator:
|
||||
return False
|
||||
|
||||
def read_from_json(self, file_path):
|
||||
"""
|
||||
Reads data from a JSON file and loads it into the current object.
|
||||
"""Reads data from a JSON file and loads it into the current object.
|
||||
|
||||
Returns:
|
||||
dict: The loaded JSON data, or an empty dict if the file doesn't exist
|
||||
@@ -253,8 +249,7 @@ class LocEvaluator:
|
||||
return {}
|
||||
|
||||
def read_from_jsonl(self, file_path):
|
||||
"""
|
||||
Reads data from a JSON file and loads it into the current object.
|
||||
"""Reads data from a JSON file and loads it into the current object.
|
||||
|
||||
Returns:
|
||||
dict: The loaded JSON data, or an empty dict if the file doesn't exist
|
||||
@@ -294,8 +289,7 @@ class LocEvaluator:
|
||||
history_idx += 1
|
||||
|
||||
def _parse_string_to_dict(self, dict_string) -> dict:
|
||||
"""
|
||||
Convert a string representation of a dictionary to an actual dictionary.
|
||||
"""Convert a string representation of a dictionary to an actual dictionary.
|
||||
|
||||
Args:
|
||||
dict_string (str): String representation of a dictionary
|
||||
@@ -328,8 +322,7 @@ class LocEvaluator:
|
||||
return None
|
||||
|
||||
def _parse_value_from_args(self, argument_str: str, key: str) -> str:
|
||||
"""
|
||||
Parse a specific key's value from argument string.
|
||||
"""Parse a specific key's value from argument string.
|
||||
|
||||
Args:
|
||||
argument_str (str): The argument string containing key-value pairs
|
||||
@@ -407,8 +400,7 @@ class LocEvaluator:
|
||||
return ''
|
||||
|
||||
def _parse_path_from_args(self, argument_str: str) -> str:
|
||||
"""
|
||||
Parse path from argument string.
|
||||
"""Parse path from argument string.
|
||||
|
||||
Args:
|
||||
argument_str (str): The argument string containing path information
|
||||
@@ -419,8 +411,7 @@ class LocEvaluator:
|
||||
return self._parse_value_from_args(argument_str, 'path')
|
||||
|
||||
def _parse_func_names_from_str(self, code_patch) -> list:
|
||||
"""
|
||||
Parse function names from the new_str code patch.
|
||||
"""Parse function names from the new_str code patch.
|
||||
|
||||
Args:
|
||||
code_patch: Either a string (argument string) or already extracted new_str code
|
||||
@@ -801,8 +792,7 @@ class LocEvaluator:
|
||||
|
||||
|
||||
def swe_data_loader(args):
|
||||
"""
|
||||
Loading SWE-Bench data.
|
||||
"""Loading SWE-Bench data.
|
||||
|
||||
Args:
|
||||
args: Main arguments.
|
||||
@@ -834,8 +824,7 @@ def swe_data_loader(args):
|
||||
|
||||
|
||||
def infer_data_loader(args):
|
||||
"""
|
||||
Load instance IDs.
|
||||
"""Load instance IDs.
|
||||
|
||||
Args:
|
||||
args: Main arguments.
|
||||
@@ -868,8 +857,7 @@ def infer_data_loader(args):
|
||||
|
||||
|
||||
def infer_cost_calculator(args):
|
||||
"""
|
||||
Calculate total and average costs from metric JSON files with detailed output.
|
||||
"""Calculate total and average costs from metric JSON files with detailed output.
|
||||
|
||||
Args:
|
||||
args: Main arguments.
|
||||
|
||||
@@ -28,8 +28,7 @@ class LocalizationInfo:
|
||||
hunks_per_file: dict[str, int] # File -> number of hunks
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""
|
||||
Convert LocalizationInfo to a dictionary for JSON serialization.
|
||||
"""Convert LocalizationInfo to a dictionary for JSON serialization.
|
||||
|
||||
Returns:
|
||||
Dictionary representation of the localization information
|
||||
@@ -58,8 +57,7 @@ class LocalizationInfo:
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict[str, Any]) -> 'LocalizationInfo':
|
||||
"""
|
||||
Create LocalizationInfo from a dictionary (for loading from JSON).
|
||||
"""Create LocalizationInfo from a dictionary (for loading from JSON).
|
||||
|
||||
Args:
|
||||
data: Dictionary containing localization information
|
||||
@@ -91,8 +89,7 @@ class LocalizationInfo:
|
||||
|
||||
|
||||
class LocMeta:
|
||||
"""
|
||||
SWE-Bench dataset loader and ground-truth localization parser.
|
||||
"""SWE-Bench dataset loader and ground-truth localization parser.
|
||||
|
||||
This class handles loading SWE-Bench datasets and extracting ground-truth
|
||||
localization information from patches for code localization evaluation.
|
||||
@@ -104,8 +101,7 @@ class LocMeta:
|
||||
dataset_name: str = 'princeton-nlp/SWE-bench_Verified',
|
||||
split: str = 'test',
|
||||
):
|
||||
"""
|
||||
Initialize LocMeta with a SWE-Bench dataset.
|
||||
"""Initialize LocMeta with a SWE-Bench dataset.
|
||||
|
||||
Args:
|
||||
dataset_name: HuggingFace dataset name (e.g., "princeton-nlp/SWE-bench_Verified")
|
||||
@@ -124,8 +120,7 @@ class LocMeta:
|
||||
self._init_swe_dataset()
|
||||
|
||||
def _init_swe_dataset(self) -> None:
|
||||
"""
|
||||
Load and initialize the SWE-Bench dataset from HuggingFace.
|
||||
"""Load and initialize the SWE-Bench dataset from HuggingFace.
|
||||
Converts to pandas DataFrame for easy manipulation.
|
||||
"""
|
||||
try:
|
||||
@@ -150,8 +145,7 @@ class LocMeta:
|
||||
raise
|
||||
|
||||
def get_instance_by_id(self, instance_id: str) -> pd.Series:
|
||||
"""
|
||||
Retrieve a specific instance by its ID.
|
||||
"""Retrieve a specific instance by its ID.
|
||||
|
||||
Args:
|
||||
instance_id: The instance identifier
|
||||
@@ -169,8 +163,7 @@ class LocMeta:
|
||||
return self.df.iloc[idx]
|
||||
|
||||
def parse_instance_loc(self, instance: Union[pd.Series, str]) -> LocalizationInfo:
|
||||
"""
|
||||
Parse ground-truth localization information from a SWE-Bench instance.
|
||||
"""Parse ground-truth localization information from a SWE-Bench instance.
|
||||
|
||||
Args:
|
||||
instance: Either a pandas Series with instance data or an instance_id string
|
||||
@@ -218,8 +211,7 @@ class LocMeta:
|
||||
def _parse_file_patch_lines(
|
||||
self, file_patch: str
|
||||
) -> tuple[list[tuple[int, int]], int, int]:
|
||||
"""
|
||||
Parse line ranges and count changes from a single file patch.
|
||||
"""Parse line ranges and count changes from a single file patch.
|
||||
|
||||
Args:
|
||||
file_patch: Patch content for a single file
|
||||
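A hedged sketch of what parsing line ranges from a single-file patch usually looks like, written here as a free function; the actual method may track additional detail.

    import re

    def parse_file_patch_lines_sketch(file_patch: str) -> tuple[list[tuple[int, int]], int, int]:
        ranges, added, removed = [], 0, 0
        for line in file_patch.splitlines():
            # Pull the "+new_start,new_count" range out of each hunk header.
            m = re.match(r'@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@', line)
            if m:
                start, count = int(m.group(1)), int(m.group(2) or 1)
                ranges.append((start, start + count - 1))
            elif line.startswith('+') and not line.startswith('+++'):
                added += 1
            elif line.startswith('-') and not line.startswith('---'):
                removed += 1
        return ranges, added, removed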
@@ -253,8 +245,7 @@ class LocMeta:
|
||||
def _parse_code_structures_from_patch(
|
||||
self, file_patch: str, file_path: str
|
||||
) -> tuple[list[str], list[str]]:
|
||||
"""
|
||||
Extract function and class names from patch context (fallback method).
|
||||
"""Extract function and class names from patch context (fallback method).
|
||||
|
||||
Args:
|
||||
file_patch: Patch content for a single file
|
||||
@@ -311,8 +302,7 @@ class LocMeta:
|
||||
def _parse_patch_localization(
|
||||
self, patch_content: str, instance_id: str
|
||||
) -> LocalizationInfo:
|
||||
"""
|
||||
Parse localization information from a git patch (improved method).
|
||||
"""Parse localization information from a git patch (improved method).
|
||||
|
||||
Args:
|
||||
patch_content: The git patch content
|
||||
@@ -390,8 +380,7 @@ class LocMeta:
|
||||
def _extract_code_structures_from_patch(
|
||||
self, file_patch: str, file_path: str
|
||||
) -> tuple[list[str], list[str]]:
|
||||
"""
|
||||
Extract function and class names from patch context and content.
|
||||
"""Extract function and class names from patch context and content.
|
||||
|
||||
Args:
|
||||
file_patch: Patch content for a single file
|
||||
@@ -519,8 +508,7 @@ class LocMeta:
|
||||
def _parse_patch_localization_with_runtime(
|
||||
self, patch_content: str, instance_id: str, runtime: Runtime
|
||||
) -> LocalizationInfo:
|
||||
"""
|
||||
Parse localization information from a git patch using OpenHands runtime.
|
||||
"""Parse localization information from a git patch using OpenHands runtime.
|
||||
This is the superior method when runtime is available.
|
||||
|
||||
Args:
|
||||
@@ -596,8 +584,7 @@ class LocMeta:
|
||||
def parse_instance_loc_with_runtime(
|
||||
self, instance: Union[pd.Series, str], runtime: Runtime = None
|
||||
) -> LocalizationInfo:
|
||||
"""
|
||||
Parse ground-truth localization information using OpenHands runtime.
|
||||
"""Parse ground-truth localization information using OpenHands runtime.
|
||||
|
||||
Args:
|
||||
instance: Either a pandas Series with instance data or an instance_id string
|
||||
@@ -634,8 +621,7 @@ class LocMeta:
|
||||
def _analyze_source_code_with_runtime(
|
||||
self, runtime: Runtime, file_path: str, affected_lines: list[int]
|
||||
) -> tuple[list[str], list[str], dict[int, str], dict[int, str]]:
|
||||
"""
|
||||
Analyze source code using OpenHands runtime to find functions and classes.
|
||||
"""Analyze source code using OpenHands runtime to find functions and classes.
|
||||
|
||||
Args:
|
||||
runtime: OpenHands runtime object
|
||||
@@ -695,8 +681,7 @@ class LocMeta:
|
||||
def _parse_cython_content_with_line_mapping(
|
||||
self, content: str, affected_lines: list[int]
|
||||
) -> tuple[list[str], list[str], dict[int, str], dict[int, str]]:
|
||||
"""
|
||||
Parse Cython content to extract functions and classes with line mapping.
|
||||
"""Parse Cython content to extract functions and classes with line mapping.
|
||||
Since Cython files can't be parsed with Python's AST, we use regex-based parsing.
|
||||
|
||||
Args:
|
||||
@@ -828,8 +813,7 @@ class LocMeta:
|
||||
def _parse_python_content_with_line_mapping(
|
||||
self, content: str, affected_lines: list[int]
|
||||
) -> tuple[list[str], list[str], dict[int, str], dict[int, str]]:
|
||||
"""
|
||||
Parse Python content to extract functions and classes with accurate line mapping.
|
||||
"""Parse Python content to extract functions and classes with accurate line mapping.
|
||||
|
||||
Args:
|
||||
content: Python source code content
|
||||
@@ -914,8 +898,7 @@ class LocMeta:
|
||||
def _parse_python_content(
|
||||
self, content: str, affected_lines: list[int]
|
||||
) -> tuple[list[str], list[str], dict[int, str], dict[int, str]]:
|
||||
"""
|
||||
Parse Python content to extract functions and classes.
|
||||
"""Parse Python content to extract functions and classes.
|
||||
|
||||
Args:
|
||||
content: Python source code content
|
||||
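A hedged, free-function sketch of the AST-based extraction this docstring describes, assuming the standard library ast module; the real method may differ in detail.

    import ast

    def parse_python_content_sketch(content: str, affected_lines: list[int]):
        func_map, class_map = {}, {}
        for node in ast.walk(ast.parse(content)):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                target = func_map
            elif isinstance(node, ast.ClassDef):
                target = class_map
            else:
                continue
            # Map each affected line to the enclosing definition by its line span.
            for line in affected_lines:
                if node.lineno <= line <= (node.end_lineno or node.lineno):
                    target[line] = node.name
        return sorted(set(func_map.values())), sorted(set(class_map.values())), func_map, class_map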
@@ -989,8 +972,7 @@ class LocMeta:
|
||||
return [], [], {}, {}
|
||||
|
||||
def _split_patch_by_files(self, patch_content: str) -> dict[str, str]:
|
||||
"""
|
||||
Split a multi-file patch into individual file patches.
|
||||
"""Split a multi-file patch into individual file patches.
|
||||
|
||||
Args:
|
||||
patch_content: Complete patch content
|
||||
@@ -1049,8 +1031,7 @@ class LocMeta:
|
||||
def _empty_localization_info(
|
||||
self, instance_id: str = 'unknown'
|
||||
) -> LocalizationInfo:
|
||||
"""
|
||||
Return an empty LocalizationInfo object.
|
||||
"""Return an empty LocalizationInfo object.
|
||||
|
||||
Args:
|
||||
instance_id: Instance identifier
|
||||
@@ -1072,8 +1053,7 @@ class LocMeta:
|
||||
)
|
||||
|
||||
def get_dataset_statistics(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get statistics about the loaded dataset.
|
||||
"""Get statistics about the loaded dataset.
|
||||
|
||||
Returns:
|
||||
Dictionary containing dataset statistics
|
||||
@@ -1095,8 +1075,7 @@ class LocMeta:
|
||||
return stats
|
||||
|
||||
def get_instances_by_repo(self, repo_name: str) -> pd.DataFrame:
|
||||
"""
|
||||
Get all instances for a specific repository.
|
||||
"""Get all instances for a specific repository.
|
||||
|
||||
Args:
|
||||
repo_name: Repository name (e.g., "django/django")
|
||||
|
||||
@@ -6,8 +6,7 @@ from openhands.core.logger import openhands_logger as logger
|
||||
|
||||
|
||||
def verify_instance_costs(row: pd.Series) -> float:
|
||||
"""
|
||||
Verifies that the accumulated_cost matches the sum of individual costs in metrics.
|
||||
"""Verifies that the accumulated_cost matches the sum of individual costs in metrics.
|
||||
Also checks for duplicate consecutive costs which might indicate buggy counting.
|
||||
If the consecutive costs are identical, the file is affected by this bug:
|
||||
https://github.com/All-Hands-AI/OpenHands/issues/5383
|
||||
|
||||
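A hedged sketch of the consistency check the docstring describes; the field names ('metrics', 'accumulated_cost', 'costs') are assumptions about the evaluation output format, not guaranteed by this diff.

    def verify_instance_costs_sketch(row) -> float:
        metrics = row.get('metrics') or {}
        costs = [entry.get('cost', 0.0) for entry in metrics.get('costs', [])]
        total = sum(costs)
        if abs(total - metrics.get('accumulated_cost', 0.0)) > 1e-6:
            print('accumulated_cost does not match the sum of per-call costs')
        if any(a == b for a, b in zip(costs, costs[1:])):
            print('identical consecutive costs: possible symptom of issue #5383')
        return total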
@@ -181,9 +181,7 @@ def distinct_methods_stats(tree, num_lines):
|
||||
|
||||
|
||||
def loops_stats(tree, num_lines):
|
||||
"""
|
||||
Calculate the average number of loops.
|
||||
"""
|
||||
"""Calculate the average number of loops."""
|
||||
total_loops = 0
|
||||
|
||||
def traverse(node):
|
||||
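The hunk shows a manual traverse() walker; a hedged equivalent using ast.walk, assuming the intended statistic is loop nodes averaged over lines of code:

    import ast

    def loops_stats_sketch(tree: ast.AST, num_lines: int) -> float:
        total_loops = sum(isinstance(node, (ast.For, ast.AsyncFor, ast.While))
                          for node in ast.walk(tree))
        return total_loops / max(num_lines, 1)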
@@ -199,9 +197,7 @@ def loops_stats(tree, num_lines):
|
||||
|
||||
|
||||
def branches_stats(tree, num_lines):
|
||||
"""
|
||||
Calculate the average number of branches (conditional statements).
|
||||
"""
|
||||
"""Calculate the average number of branches (conditional statements)."""
|
||||
total_branches = 0
|
||||
|
||||
def traverse(node):
|
||||
|
||||
@@ -192,8 +192,7 @@ def run_mutation_testing(
|
||||
def grade_test_output(
|
||||
test_suite: str, instance: pd.Series, test_output: str, test_spec: TestSpec, runtime
|
||||
):
|
||||
"""
|
||||
Two-pass test grading with short-circuiting:
|
||||
"""Two-pass test grading with short-circuiting:
|
||||
1. Run all tests to identify passing/failing tests
|
||||
2. If no failing tests, evaluate coverage immediately
|
||||
3. Otherwise, run only passing tests for coverage analysis
|
||||
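A hedged sketch of the two-pass flow listed above; run_tests and run_coverage are hypothetical callables standing in for the benchmark's runtime hooks.

    def grade_test_output_sketch(run_tests, run_coverage, test_suite):
        statuses = run_tests(test_suite)                  # pass 1: run everything
        failing = [t for t, s in statuses.items() if s != 'PASSED']
        if not failing:
            return statuses, run_coverage(test_suite)     # short-circuit: all tests pass
        passing = [t for t in statuses if t not in failing]
        return statuses, run_coverage(passing)            # pass 2: coverage on passing tests only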
@@ -280,8 +279,7 @@ def process_instance(
|
||||
reset_logger: bool = True,
|
||||
log_dir: str | None = None,
|
||||
) -> EvalOutput:
|
||||
"""
|
||||
Evaluate agent performance on a TestGenEval problem instance.
|
||||
"""Evaluate agent performance on a TestGenEval problem instance.
|
||||
|
||||
Note that this signature differs from the expected input to `run_evaluation`. Use
|
||||
`functools.partial` to provide optional arguments before passing to the evaluation harness.
|
||||
@@ -453,8 +451,7 @@ def process_instance(
|
||||
|
||||
|
||||
def count_and_log_fields(evaluated_predictions, fields, key):
|
||||
"""
|
||||
Count and log the sum of specified fields in the evaluated predictions,
|
||||
"""Count and log the sum of specified fields in the evaluated predictions,
|
||||
ignoring fields with a value of -1. If all values for a field are -1,
|
||||
return -1.
|
||||
|
||||
|
||||
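A hedged sketch of the aggregation rule just described: sum each field, skip -1 sentinel values, and report -1 when every value for a field is -1.

    def count_fields_sketch(rows: list[dict], fields: list[str]) -> dict[str, int]:
        totals = {}
        for field in fields:
            values = [row[field] for row in rows if row.get(field, -1) != -1]
            totals[field] = sum(values) if values else -1
        return totals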
@@ -4,8 +4,7 @@ from evaluation.benchmarks.testgeneval.constants import TestStatus
|
||||
|
||||
|
||||
def parse_log_pytest(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with PyTest framework
|
||||
"""Parser for test logs generated with PyTest framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
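A hedged sketch of a pytest log parser of this shape, mapping each reported test id to its status; the real parser's patterns may differ.

    def parse_log_pytest_sketch(log: str) -> dict[str, str]:
        statuses = ('PASSED', 'FAILED', 'ERROR', 'SKIPPED')
        results: dict[str, str] = {}
        for line in log.splitlines():
            # Typical result lines look like "PASSED tests/test_x.py::test_y".
            parts = line.split()
            if len(parts) >= 2 and parts[0] in statuses:
                results[parts[1]] = parts[0]
        return results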
@@ -26,8 +25,7 @@ def parse_log_pytest(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_pytest_options(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with PyTest framework with options
|
||||
"""Parser for test logs generated with PyTest framework with options
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -61,8 +59,7 @@ def parse_log_pytest_options(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_django(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with Django tester framework
|
||||
"""Parser for test logs generated with Django tester framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -141,8 +138,7 @@ def parse_log_django(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_pytest_v2(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with PyTest framework (Later Version)
|
||||
"""Parser for test logs generated with PyTest framework (Later Version)
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -170,8 +166,7 @@ def parse_log_pytest_v2(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_seaborn(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with seaborn testing framework
|
||||
"""Parser for test logs generated with seaborn testing framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -196,8 +191,7 @@ def parse_log_seaborn(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_sympy(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with Sympy framework
|
||||
"""Parser for test logs generated with Sympy framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
@@ -229,8 +223,7 @@ def parse_log_sympy(log: str) -> dict[str, str]:
|
||||
|
||||
|
||||
def parse_log_matplotlib(log: str) -> dict[str, str]:
|
||||
"""
|
||||
Parser for test logs generated with PyTest framework
|
||||
"""Parser for test logs generated with PyTest framework
|
||||
|
||||
Args:
|
||||
log (str): log content
|
||||
|
||||
@@ -12,8 +12,7 @@ if sys.getrecursionlimit() < 10_000:
|
||||
|
||||
|
||||
def bleu(gold: list[str], pred: list[str]) -> float:
|
||||
"""
|
||||
Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
|
||||
"""Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
|
||||
|
||||
:param gold: list of gold tokens
|
||||
:param pred: list of predicted tokens
|
||||
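A hedged sketch using NLTK, which provides the smoothing method 2 and auto reweighting the docstring mentions; the benchmark's own wrapper may differ.

    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    def bleu_sketch(gold: list[str], pred: list[str]) -> float:
        # Scale the sentence-level BLEU to the 0~100 range used elsewhere.
        return 100.0 * sentence_bleu(
            [gold], pred,
            smoothing_function=SmoothingFunction().method2,
            auto_reweigh=True,
        )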
@@ -30,8 +29,7 @@ def bleu(gold: list[str], pred: list[str]) -> float:
|
||||
|
||||
|
||||
def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
|
||||
"""
|
||||
Calculate BLEU score for a batch of sentences.
|
||||
"""Calculate BLEU score for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -43,8 +41,7 @@ def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
|
||||
|
||||
|
||||
def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
|
||||
"""
|
||||
Calculate corpus-level BLEU score for a batch of sentences.
|
||||
"""Calculate corpus-level BLEU score for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -63,8 +60,7 @@ def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
|
||||
def edit_sim(
|
||||
gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
|
||||
) -> float:
|
||||
"""
|
||||
Calculate char-level edit similarity, in the range of 0~100.
|
||||
"""Calculate char-level edit similarity, in the range of 0~100.
|
||||
|
||||
:param gold: gold sentence or list of gold tokens
|
||||
:param pred: predicted sentence or list of predicted tokens
|
||||
@@ -85,8 +81,7 @@ def batch_edit_sim(
|
||||
preds: list[Union[str, list[str]]],
|
||||
sep: str = ' ',
|
||||
) -> list[float]:
|
||||
"""
|
||||
Calculate char-level edit similarity for a batch of sentences.
|
||||
"""Calculate char-level edit similarity for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -102,8 +97,7 @@ T = TypeVar('T')
|
||||
|
||||
|
||||
def exact_match(gold: T, pred: T) -> float:
|
||||
"""
|
||||
Calculate exact match accuracy, in the range of {0, 100}.
|
||||
"""Calculate exact match accuracy, in the range of {0, 100}.
|
||||
|
||||
:param gold: gold sentence or list of gold tokens
|
||||
:param pred: predicted sentence or list of predicted tokens
|
||||
@@ -115,8 +109,7 @@ def exact_match(gold: T, pred: T) -> float:
|
||||
|
||||
|
||||
def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
|
||||
"""
|
||||
Calculate exact match accuracy for a batch of sentences.
|
||||
"""Calculate exact match accuracy for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -130,8 +123,7 @@ def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
|
||||
def rouge_l(
|
||||
gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
|
||||
) -> dict[str, float]:
|
||||
"""
|
||||
Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
|
||||
"""Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
|
||||
|
||||
:param gold: gold sentence or list of gold tokens
|
||||
:param pred: predicted sentence or list of predicted tokens
|
||||
@@ -156,8 +148,7 @@ def batch_rouge_l(
|
||||
preds: list[Union[str, list[str]]],
|
||||
sep: str = ' ',
|
||||
) -> dict[str, list[float]]:
|
||||
"""
|
||||
Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
|
||||
"""Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -175,8 +166,7 @@ def accuracy(
|
||||
pred: list[str],
|
||||
ignore: Optional[Sequence[str]] = None,
|
||||
) -> float:
|
||||
"""
|
||||
Calculate token-level accuracy, in the range of 0~100.
|
||||
"""Calculate token-level accuracy, in the range of 0~100.
|
||||
If gold and pred are not the same length, the longer one would be truncated.
|
||||
|
||||
:param gold: list of gold tokens
|
||||
@@ -210,8 +200,7 @@ def batch_accuracy(
|
||||
preds: list[list[str]],
|
||||
ignore: Optional[Sequence[str]] = None,
|
||||
) -> list[float]:
|
||||
"""
|
||||
Calculate token-level accuracy for a batch of sentences.
|
||||
"""Calculate token-level accuracy for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
@@ -226,8 +215,7 @@ def batch_accuracy(
|
||||
def first_match_to_topk(
|
||||
first_match_list: list[int], k_values: list[int]
|
||||
) -> dict[int, list[float]]:
|
||||
"""
|
||||
Calculate top-k accuracy with the first match ranks (1-indexed).
|
||||
"""Calculate top-k accuracy with the first match ranks (1-indexed).
|
||||
|
||||
:param first_match: first match ranks (1-indexed)
|
||||
:param k_values: k values to consider
|
||||
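A hedged sketch matching the documented signature: for every k, each query scores 100 if its first correct match rank (1-indexed) falls within the top k, else 0.

    def first_match_to_topk_sketch(first_match_list: list[int],
                                   k_values: list[int]) -> dict[int, list[float]]:
        return {k: [100.0 if rank <= k else 0.0 for rank in first_match_list]
                for k in k_values}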
@@ -237,8 +225,7 @@ def first_match_to_topk(
|
||||
|
||||
|
||||
def pass_at_k(n: int, c: int, k: int) -> float:
|
||||
"""
|
||||
Sample pass@k metric according to the Codex paper, but in the scale of 0~100.
|
||||
"""Sample pass@k metric according to the Codex paper, but in the scale of 0~100.
|
||||
:param n: total number of samples
|
||||
:param c: number of correct samples
|
||||
:param k: k in pass@$k$
|
||||
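A hedged sketch of the Codex-paper pass@k estimator the docstring cites, rescaled to 0~100.

    from math import comb

    def pass_at_k_sketch(n: int, c: int, k: int) -> float:
        if n - c < k:  # every size-k draw necessarily contains a correct sample
            return 100.0
        return 100.0 * (1.0 - comb(n - c, k) / comb(n, k))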
@@ -251,8 +238,7 @@ def pass_at_k(n: int, c: int, k: int) -> float:
|
||||
|
||||
|
||||
def self_bleu(samples: list[list[str]]) -> float:
|
||||
"""
|
||||
Calculate self-BLEU among the samples.
|
||||
"""Calculate self-BLEU among the samples.
|
||||
:param samples: the chosen m samples
|
||||
:return: self-BLEU
|
||||
"""
|
||||
@@ -274,8 +260,7 @@ def self_bleu(samples: list[list[str]]) -> float:
|
||||
|
||||
|
||||
def self_edit_distance(samples: list[Union[str, list[str]]], sep=' ') -> float:
|
||||
"""
|
||||
Calculate self-edit-distance among the samples.
|
||||
"""Calculate self-edit-distance among the samples.
|
||||
:param samples: the chosen m samples
|
||||
:param sep: the separator between tokens
|
||||
:return: self-edit-distance
|
||||
|
||||
@@ -30,8 +30,7 @@ def check_mutation(mutation_output):
|
||||
|
||||
|
||||
def count_methods(code_str):
|
||||
"""
|
||||
Counts the number of methods/functions in a given string of code.
|
||||
"""Counts the number of methods/functions in a given string of code.
|
||||
|
||||
Args:
|
||||
code_str (str): A string containing code.
|
||||
@@ -46,8 +45,7 @@ def count_methods(code_str):
|
||||
|
||||
|
||||
def get_lines_of_code(code_str):
|
||||
"""
|
||||
Extracts lines of code from a given string.
|
||||
"""Extracts lines of code from a given string.
|
||||
|
||||
Args:
|
||||
code_str (str): A string containing code.
|
||||
|
||||
@@ -7,8 +7,7 @@ import traceback
|
||||
|
||||
|
||||
def insert_line_in_string(input_string, new_str, insert_line):
|
||||
"""
|
||||
Inserts a new line into a string at the specified line number.
|
||||
"""Inserts a new line into a string at the specified line number.
|
||||
|
||||
:param input_string: The original string.
|
||||
:param new_str: The string to insert.
|
||||
@@ -29,8 +28,7 @@ def insert_line_in_string(input_string, new_str, insert_line):
|
||||
|
||||
|
||||
def print_string_diff(original, modified):
|
||||
"""
|
||||
Prints the differences between two strings line by line.
|
||||
"""Prints the differences between two strings line by line.
|
||||
|
||||
:param original: The original string.
|
||||
:param modified: The modified string.
|
||||
|
||||
@@ -37,8 +37,7 @@ def extract_preamble_classes_and_functions(code):
|
||||
current_position = 0
|
||||
|
||||
def extract_class_body(code: str, start_index: int) -> tuple[str, int]:
|
||||
"""
|
||||
Extracts the body of a class from the given code starting from the specified index.
|
||||
"""Extracts the body of a class from the given code starting from the specified index.
|
||||
Returns the class body and the end index of the class body.
|
||||
"""
|
||||
if not code or start_index < 0 or start_index >= len(code):
|
||||
@@ -168,8 +167,8 @@ def extract_preamble_classes_and_functions(code):
|
||||
def filter_passing_tests(
|
||||
test_content: str, test_output: str, repo: str
|
||||
) -> tuple[str, list[str], list[str]]:
|
||||
"""
|
||||
Filter tests based on their execution results.
|
||||
"""Filter tests based on their execution results.
|
||||
|
||||
Returns:
|
||||
Tuple containing:
|
||||
- Modified test content with only passing tests
|
||||
@@ -246,8 +245,7 @@ def filter_passing_tests(
|
||||
def filter_tests(
|
||||
test_content: str, test_output: str, repo: str
|
||||
) -> tuple[str, list[str], list[str]]:
|
||||
"""
|
||||
Filter tests using AST parsing to remove failing test functions from the test file.
|
||||
"""Filter tests using AST parsing to remove failing test functions from the test file.
|
||||
Non-test functions (e.g. setup or helper methods) and classes (even if all test methods are failing)
|
||||
are preserved.
|
||||
|
||||
|
||||
@@ -20,9 +20,7 @@ DIFF_MODIFIED_FILE_REGEX = r'--- a/(.*)'
|
||||
|
||||
@dataclass
|
||||
class TestSpec:
|
||||
"""
|
||||
A dataclass that represents a test specification for a single instance of SWE-bench.
|
||||
"""
|
||||
"""A dataclass that represents a test specification for a single instance of SWE-bench."""
|
||||
|
||||
instance_id: str
|
||||
id: str
|
||||
@@ -86,10 +84,7 @@ def make_test_setup(specs, env_name, repo_directory, includes_tox=False):
|
||||
|
||||
|
||||
def make_test_script_list(test_cmd, specs, env_name, repo_directory):
|
||||
"""
|
||||
Runs the tests.
|
||||
"""
|
||||
|
||||
"""Runs the tests."""
|
||||
includes_tox = 'tox' in test_cmd
|
||||
eval_commands = make_test_setup(specs, env_name, repo_directory, includes_tox)
|
||||
eval_commands += [
|
||||
@@ -104,10 +99,7 @@ def make_test_script_list(test_cmd, specs, env_name, repo_directory):
|
||||
|
||||
|
||||
def make_mutation_script_list(specs, env_name, repo_directory, mutation_timeout):
|
||||
"""
|
||||
Runs the tests.
|
||||
"""
|
||||
|
||||
"""Runs the tests."""
|
||||
eval_commands = make_test_setup(specs, env_name, repo_directory)
|
||||
eval_commands += [
|
||||
'cosmic-ray init mutation.toml mutation.sqlite',
|
||||
|
||||
@@ -11,8 +11,7 @@ from evaluation.benchmarks.testgeneval.constants import (
|
||||
|
||||
|
||||
def get_test_directives(instance: TestGenEvalInstance) -> list:
|
||||
"""
|
||||
Get test directives from the test_patch of a task instance
|
||||
"""Get test directives from the test_patch of a task instance
|
||||
|
||||
Args:
|
||||
instance (dict): task instance
|
||||
@@ -43,9 +42,7 @@ def get_test_directives(instance: TestGenEvalInstance) -> list:
|
||||
def load_testgeneval_dataset(
|
||||
name='kjain14/testgeneval', split='test', ids=None
|
||||
) -> list[TestGenEvalInstance]:
|
||||
"""
|
||||
Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file
|
||||
"""
|
||||
"""Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file"""
|
||||
# check that all instance IDs are in the dataset
|
||||
if ids:
|
||||
ids = set(ids)
|
||||
|
||||
@@ -24,9 +24,7 @@ class ActionType(Enum):
|
||||
|
||||
@dataclass
|
||||
class Selector:
|
||||
"""
|
||||
Represents either a direct anchor ID or a descriptive selector
|
||||
"""
|
||||
"""Represents either a direct anchor ID or a descriptive selector"""
|
||||
|
||||
value: str
|
||||
is_anchor: bool = False
|
||||
@@ -149,8 +147,7 @@ def find_matching_anchor(content: str, selector: str) -> str | None:
|
||||
|
||||
|
||||
def resolve_action(action: BrowserAction, content: str) -> BrowserAction:
|
||||
"""
|
||||
Resolve any descriptive selectors in the action to anchor IDs based on the content.
|
||||
"""Resolve any descriptive selectors in the action to anchor IDs based on the content.
|
||||
Returns a new action with resolved selectors.
|
||||
"""
|
||||
if isinstance(action, (InputAction, ClickAction)):
|
||||
@@ -174,8 +171,7 @@ def pre_login(
|
||||
save_screenshots=True,
|
||||
screenshots_dir='screenshots',
|
||||
):
|
||||
"""
|
||||
Logs in to all the websites that are needed for the evaluation.
|
||||
"""Logs in to all the websites that are needed for the evaluation.
|
||||
Once logged in, the sessions would be cached in the browser, so OpenHands
|
||||
agent doesn't need to log in to these websites again.
|
||||
"""
|
||||
|
||||
@@ -68,8 +68,7 @@ def get_config(
|
||||
|
||||
|
||||
def load_dependencies(runtime: Runtime) -> list[str]:
|
||||
"""
|
||||
Every task has a dependencies.yml file, which lists all the services that the
|
||||
"""Every task has a dependencies.yml file, which lists all the services that the
|
||||
task depends on. This function loads the file and returns all dependent service names.
|
||||
"""
|
||||
command = 'cat /utils/dependencies.yml'
|
||||
|
||||
@@ -11,9 +11,7 @@ import sys
|
||||
|
||||
|
||||
def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
|
||||
"""
|
||||
Calculate the cost of the model call.
|
||||
"""
|
||||
"""Calculate the cost of the model call."""
|
||||
if 'claude-3-5-sonnet' in model.lower():
|
||||
# https://www.anthropic.com/pricing#anthropic-api, accessed 12/11/2024
|
||||
return 0.000003 * prompt_tokens + 0.000015 * completion_tokens
|
||||
@@ -60,8 +58,7 @@ def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> fl
|
||||
|
||||
|
||||
def analyze_eval_json_file(filepath: str) -> tuple[int, int]:
|
||||
"""
|
||||
Analyze a single eval JSON file and extract the total and result from final_score.
|
||||
"""Analyze a single eval JSON file and extract the total and result from final_score.
|
||||
|
||||
Args:
|
||||
filepath: Path to the JSON file
|
||||
@@ -84,8 +81,7 @@ def analyze_eval_json_file(filepath: str) -> tuple[int, int]:
|
||||
|
||||
|
||||
def analyze_traj_json_file(filepath: str) -> tuple[int, float]:
|
||||
"""
|
||||
Analyze a single trajectory JSON file and extract the steps and tokens
|
||||
"""Analyze a single trajectory JSON file and extract the steps and tokens
|
||||
for each step. Then estimate the cost based on the tokens and the model type.
|
||||
Note: this is assuming there's no prompt caching at all.
|
||||
"""
|
||||
@@ -115,8 +111,7 @@ def analyze_traj_json_file(filepath: str) -> tuple[int, float]:
|
||||
def analyze_folder(
|
||||
folder_path: str,
|
||||
) -> tuple[dict[str, tuple[int, int]], dict[str, tuple[int, float]]]:
|
||||
"""
|
||||
Analyze all eval_*.json & traj_*.json files in the specified folder.
|
||||
"""Analyze all eval_*.json & traj_*.json files in the specified folder.
|
||||
|
||||
Args:
|
||||
folder_path: Path to the folder containing JSON files
|
||||
@@ -148,9 +143,7 @@ def analyze_folder(
|
||||
|
||||
|
||||
def get_task_nature_category(task_name: str) -> str:
|
||||
"""
|
||||
Get the nature category of the task.
|
||||
"""
|
||||
"""Get the nature category of the task."""
|
||||
task_nature = task_name.split('-')[0]
|
||||
if task_nature.lower() in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance']:
|
||||
return task_nature
|
||||
@@ -159,8 +152,7 @@ def get_task_nature_category(task_name: str) -> str:
|
||||
|
||||
|
||||
def calculate_score(total: int, result: int) -> float:
|
||||
"""
|
||||
Calculate the score as a number between 0 and 1.
|
||||
"""Calculate the score as a number between 0 and 1.
|
||||
|
||||
Formula: score = (result / total) * 0.5 + (result // total) * 0.5
|
||||
Explanation:
|
||||
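Worked out from the formula quoted above: half the score scales with the fraction of points earned, and the other half is granted only on a perfect run, since result // total is 1 only when result equals total.

    def calculate_score_sketch(total: int, result: int) -> float:
        return (result / total) * 0.5 + (result // total) * 0.5

    # e.g. 2 of 4 points -> 0.25, 4 of 4 points -> 1.0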
@@ -178,8 +170,7 @@ def calculate_score(total: int, result: int) -> float:
|
||||
|
||||
|
||||
def is_perfect_completion(total: int, result: int) -> bool:
|
||||
"""
|
||||
Check if the task achieved perfect completion.
|
||||
"""Check if the task achieved perfect completion.
|
||||
|
||||
Args:
|
||||
total: Total possible points
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
GPT performs line level generation prediction and truncates overly long tokens
|
||||
"""
|
||||
"""GPT performs line level generation prediction and truncates overly long tokens"""
|
||||
|
||||
import json
|
||||
import os
|
||||
@@ -56,8 +54,7 @@ def predict(content, model_name):
|
||||
|
||||
|
||||
def bulid_prompt(description, old_version, old_code, new_version) -> str:
|
||||
"""
|
||||
build prompt
|
||||
"""Build prompt
|
||||
:param version:
|
||||
:param description:
|
||||
:param masked_code:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
GPT performs line level generation prediction and truncates overly long tokens
|
||||
"""
|
||||
"""GPT performs line level generation prediction and truncates overly long tokens"""
|
||||
|
||||
import json
|
||||
import os
|
||||
@@ -56,8 +54,7 @@ def predict(content, model_name):
|
||||
|
||||
|
||||
def bulid_prompt(version, description) -> str:
|
||||
"""
|
||||
build prompt
|
||||
"""Build prompt
|
||||
:param version:
|
||||
:param description:
|
||||
:param masked_code:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
block completion
|
||||
"""
|
||||
"""block completion"""
|
||||
|
||||
import copy
|
||||
import gc
|
||||
@@ -79,8 +77,7 @@ def run_inference(model_name, origin_data_list):
|
||||
|
||||
|
||||
def bulid_prompt(version, description) -> str:
|
||||
"""
|
||||
build prompt
|
||||
"""Build prompt
|
||||
:param version:
|
||||
:param description:
|
||||
:param masked_code:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
code migration
|
||||
"""
|
||||
"""code migration"""
|
||||
|
||||
import copy
|
||||
import gc
|
||||
@@ -81,8 +79,7 @@ def run_inference(model_name, origin_data_list):
|
||||
|
||||
|
||||
def bulid_prompt(description, old_version, old_code, new_version) -> str:
|
||||
"""
|
||||
build prompt
|
||||
"""Build prompt
|
||||
:param version:
|
||||
:param description:
|
||||
:param masked_code:
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
"""
|
||||
评测block的预测能力
|
||||
"""评测block的预测能力
|
||||
1、判断是否包含正确的函数名
|
||||
2、判断是否合法
|
||||
3、计算ISM,和PM
|
||||
@@ -22,8 +21,7 @@ def is_code_valid(code):
|
||||
|
||||
|
||||
def longest_common_prefix_between_lists_with_elements(list1, list2):
|
||||
"""
|
||||
计算两个字符串列表中元素的最长前缀匹配长度
|
||||
"""计算两个字符串列表中元素的最长前缀匹配长度
|
||||
:param list1:
|
||||
:param list2:
|
||||
:return:
|
||||
@@ -46,8 +44,7 @@ def longest_common_prefix_between_lists_with_elements(list1, list2):
|
||||
|
||||
|
||||
def get_token(ans_code: str, output_code: str):
|
||||
"""
|
||||
对代码进行词法分析,分解成标识符,返回两个标识符列表
|
||||
"""对代码进行词法分析,分解成标识符,返回两个标识符列表
|
||||
:param ans_code:
|
||||
:param output_code:
|
||||
:return:
|
||||
@@ -94,8 +91,7 @@ def get_token(ans_code: str, output_code: str):
|
||||
|
||||
|
||||
def get_token_per_line(code: str):
|
||||
"""
|
||||
对每一行代码进行词法分析,记录每一行的标识符
|
||||
"""对每一行代码进行词法分析,记录每一行的标识符
|
||||
:param code: 代码字符串
|
||||
:return: 每一行的标识符列表组成的列表
|
||||
"""
|
||||
@@ -117,8 +113,7 @@ def get_token_per_line(code: str):
|
||||
|
||||
|
||||
def get_ISM(answer_code: str, model_output_list: list, answer_name: str) -> list:
|
||||
"""
|
||||
计算ISM,返回一个有序的得分列表
|
||||
"""计算ISM,返回一个有序的得分列表
|
||||
:return:
|
||||
"""
|
||||
score_list = []
|
||||
@@ -157,8 +152,7 @@ def get_ISM(answer_code: str, model_output_list: list, answer_name: str) -> list
|
||||
def get_ISM_without_verification(
|
||||
answer_code: str, model_output_list: list, answer_name: str
|
||||
) -> list:
|
||||
"""
|
||||
计算ISM,返回一个有序的得分列表
|
||||
"""计算ISM,返回一个有序的得分列表
|
||||
:return:
|
||||
"""
|
||||
score_list = []
|
||||
@@ -190,8 +184,7 @@ def get_ISM_without_verification(
|
||||
|
||||
|
||||
def longest_common_prefix_with_lengths(list1, list2):
|
||||
"""
|
||||
计算两个二维列表中每个子列表的最长前缀匹配长度,并记录拥有最长前缀匹配长度的两个子列表的长度
|
||||
"""计算两个二维列表中每个子列表的最长前缀匹配长度,并记录拥有最长前缀匹配长度的两个子列表的长度
|
||||
:param list1: 第一个二维列表
|
||||
:param list2: 第二个二维列表
|
||||
:return: 最长前缀匹配长度以及拥有最长前缀匹配长度的两个子列表的长度
|
||||
@@ -216,8 +209,7 @@ def longest_common_prefix_with_lengths(list1, list2):
|
||||
|
||||
|
||||
def get_PM(answer_code: str, model_output_list: list, answer_name: str) -> list:
|
||||
"""
|
||||
计算PM,返回一个有序的得分列表
|
||||
"""计算PM,返回一个有序的得分列表
|
||||
:return:
|
||||
"""
|
||||
score_list = []
|
||||
@@ -254,8 +246,7 @@ def get_PM(answer_code: str, model_output_list: list, answer_name: str) -> list:
|
||||
|
||||
|
||||
def get_score(score_list: list, k):
|
||||
"""
|
||||
计算score@n,k
|
||||
"""计算score@n,k
|
||||
:param score_list:
|
||||
:param k:
|
||||
:return:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Calculate the cdc score for migration
|
||||
"""
|
||||
"""Calculate the cdc score for migration"""
|
||||
|
||||
import json
|
||||
import math
|
||||
@@ -11,8 +9,7 @@ import re
|
||||
|
||||
|
||||
def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断参数数量是否一致
|
||||
"""判断参数数量是否一致
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -43,8 +40,7 @@ def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断关键词参数赋值是否正确使用
|
||||
"""判断关键词参数赋值是否正确使用
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -82,8 +78,7 @@ def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def with_correct(answer_code: str, model_output: str) -> bool:
|
||||
"""
|
||||
当answer是with结构时,判断模型生成的是不是with结构
|
||||
"""当answer是with结构时,判断模型生成的是不是with结构
|
||||
:param answer_code:
|
||||
:param model_output:
|
||||
:return:
|
||||
@@ -105,9 +100,7 @@ def compute_block_score_k(
|
||||
core_line_in_core_block,
|
||||
core_line_in_output_clear,
|
||||
):
|
||||
"""
|
||||
cdc需要满足五个条件,em只需要满足第一个条件
|
||||
"""
|
||||
"""cdc需要满足五个条件,em只需要满足第一个条件"""
|
||||
c = 0
|
||||
n = len(model_output)
|
||||
for index, code in enumerate(model_output):
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Calculate the cdc score for line and block
|
||||
"""
|
||||
"""Calculate the cdc score for line and block"""
|
||||
|
||||
import json
|
||||
import math
|
||||
@@ -19,8 +17,7 @@ def is_code_valid(code):
|
||||
|
||||
|
||||
def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断参数数量是否一致
|
||||
"""判断参数数量是否一致
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -51,8 +48,7 @@ def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断关键词参数赋值是否正确使用
|
||||
"""判断关键词参数赋值是否正确使用
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -90,8 +86,7 @@ def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def with_correct(answer_code: str, model_output: str) -> bool:
|
||||
"""
|
||||
当answer是with结构时,判断模型生成的是不是with结构
|
||||
"""当answer是with结构时,判断模型生成的是不是with结构
|
||||
:param answer_code:
|
||||
:param model_output:
|
||||
:return:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Calculate the cdc score for line and block
|
||||
"""
|
||||
"""Calculate the cdc score for line and block"""
|
||||
|
||||
import json
|
||||
import math
|
||||
@@ -19,8 +17,7 @@ def is_code_valid(code):
|
||||
|
||||
|
||||
def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断参数数量是否一致
|
||||
"""判断参数数量是否一致
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -51,8 +48,7 @@ def is_correct_parameter_count(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
"""
|
||||
判断关键词参数赋值是否正确使用
|
||||
"""判断关键词参数赋值是否正确使用
|
||||
:param function_name:
|
||||
:param correct_code:
|
||||
:param test_code:
|
||||
@@ -90,8 +86,7 @@ def check_keyword_parameters(function_name, correct_code, test_code):
|
||||
|
||||
|
||||
def with_correct(answer_code: str, model_output: str) -> bool:
|
||||
"""
|
||||
当answer是with结构时,判断模型生成的是不是with结构
|
||||
"""当answer是with结构时,判断模型生成的是不是with结构
|
||||
:param answer_code:
|
||||
:param model_output:
|
||||
:return:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Find the line of code generated by the model using the block in the version code
|
||||
"""
|
||||
"""Find the line of code generated by the model using the block in the version code"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Find the line of code generated by the model using the block in the version code
|
||||
"""
|
||||
"""Find the line of code generated by the model using the block in the version code"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
"""
|
||||
Clear the<start>and<end>generated by the model in inference
|
||||
"""
|
||||
"""Clear the<start>and<end>generated by the model in inference"""
|
||||
|
||||
import json
|
||||
|
||||
|
||||
@@ -622,8 +622,7 @@ def compatibility_for_eval_history_pairs(
 
 
 def is_fatal_evaluation_error(error: str | None) -> bool:
-    """
-    The AgentController class overrides last error for certain exceptions
+    """The AgentController class overrides last error for certain exceptions
     We want to ensure those exceptions do not overlap with fatal exceptions defined here
     This is because we do a comparison against the stringified error
     """