mirror of https://github.com/All-Hands-AI/OpenHands.git
chore(lint): Apply comprehensive linting and formatting fixes (#10287)
Co-authored-by: openhands <openhands@all-hands.dev>
@@ -181,9 +181,7 @@ def distinct_methods_stats(tree, num_lines):
 
 
 def loops_stats(tree, num_lines):
-    """
-    Calculate the average number of loops.
-    """
+    """Calculate the average number of loops."""
     total_loops = 0
 
     def traverse(node):
@@ -199,9 +197,7 @@ def loops_stats(tree, num_lines):
 
 
 def branches_stats(tree, num_lines):
-    """
-    Calculate the average number of branches (conditional statements).
-    """
+    """Calculate the average number of branches (conditional statements)."""
     total_branches = 0
 
     def traverse(node):
@@ -192,8 +192,7 @@ def run_mutation_testing(
 def grade_test_output(
     test_suite: str, instance: pd.Series, test_output: str, test_spec: TestSpec, runtime
 ):
-    """
-    Two-pass test grading with short-circuiting:
+    """Two-pass test grading with short-circuiting:
     1. Run all tests to identify passing/failing tests
     2. If no failing tests, evaluate coverage immediately
     3. Otherwise, run only passing tests for coverage analysis
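As an aside to the hunk above, a minimal sketch of the two-pass flow that docstring describes; `run_tests` and `measure_coverage` are hypothetical stand-ins, not helpers from this repository.

# Hypothetical helpers: run_tests(suite) -> dict of test name -> status,
# measure_coverage(suite, only=None) -> coverage report for the (sub)suite.
def grade_two_pass(test_suite, run_tests, measure_coverage):
    # Pass 1: run everything and split the results.
    results = run_tests(test_suite)
    passing = [t for t, s in results.items() if s == 'PASSED']
    failing = [t for t, s in results.items() if s != 'PASSED']

    # Short-circuit: if nothing failed, the first run already gives valid coverage.
    if not failing:
        return passing, failing, measure_coverage(test_suite)

    # Pass 2: re-run only the passing tests so coverage reflects successful tests.
    return passing, failing, measure_coverage(test_suite, only=passing)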
@@ -280,8 +279,7 @@ def process_instance(
     reset_logger: bool = True,
     log_dir: str | None = None,
 ) -> EvalOutput:
-    """
-    Evaluate agent performance on a TestGenEval problem instance.
+    """Evaluate agent performance on a TestGenEval problem instance.
 
     Note that this signature differs from the expected input to `run_evaluation`. Use
     `functools.partial` to provide optional arguments before passing to the evaluation harness.
@@ -453,8 +451,7 @@ def process_instance(
 
 
 def count_and_log_fields(evaluated_predictions, fields, key):
-    """
-    Count and log the sum of specified fields in the evaluated predictions,
+    """Count and log the sum of specified fields in the evaluated predictions,
     ignoring fields with a value of -1. If all values for a field are -1,
     return -1.
 
@@ -4,8 +4,7 @@ from evaluation.benchmarks.testgeneval.constants import TestStatus
 
 
 def parse_log_pytest(log: str) -> dict[str, str]:
-    """
-    Parser for test logs generated with PyTest framework
+    """Parser for test logs generated with PyTest framework
 
     Args:
         log (str): log content
@@ -26,8 +25,7 @@ def parse_log_pytest(log: str) -> dict[str, str]:
 
 
 def parse_log_pytest_options(log: str) -> dict[str, str]:
-    """
-    Parser for test logs generated with PyTest framework with options
+    """Parser for test logs generated with PyTest framework with options
 
     Args:
         log (str): log content
@@ -61,8 +59,7 @@ def parse_log_pytest_options(log: str) -> dict[str, str]:
 
 
 def parse_log_django(log: str) -> dict[str, str]:
-    """
-    Parser for test logs generated with Django tester framework
+    """Parser for test logs generated with Django tester framework
 
     Args:
         log (str): log content
@@ -141,8 +138,7 @@ def parse_log_django(log: str) -> dict[str, str]:
 
 
 def parse_log_pytest_v2(log: str) -> dict[str, str]:
-    """
-    Parser for test logs generated with PyTest framework (Later Version)
+    """Parser for test logs generated with PyTest framework (Later Version)
 
     Args:
         log (str): log content
@@ -170,8 +166,7 @@ def parse_log_pytest_v2(log: str) -> dict[str, str]:
 
 
 def parse_log_seaborn(log: str) -> dict[str, str]:
-    """
-    Parser for test logs generated with seaborn testing framework
+    """Parser for test logs generated with seaborn testing framework
 
     Args:
         log (str): log content
@@ -196,8 +191,7 @@ def parse_log_seaborn(log: str) -> dict[str, str]:
 
 
 def parse_log_sympy(log: str) -> dict[str, str]:
-    """
-    Parser for test logs generated with Sympy framework
+    """Parser for test logs generated with Sympy framework
 
     Args:
         log (str): log content
@@ -229,8 +223,7 @@ def parse_log_sympy(log: str) -> dict[str, str]:
 
 
 def parse_log_matplotlib(log: str) -> dict[str, str]:
-    """
-    Parser for test logs generated with PyTest framework
+    """Parser for test logs generated with PyTest framework
 
     Args:
         log (str): log content
@@ -12,8 +12,7 @@ if sys.getrecursionlimit() < 10_000:
 
 
 def bleu(gold: list[str], pred: list[str]) -> float:
-    """
-    Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
+    """Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
 
     :param gold: list of gold tokens
     :param pred: list of predicted tokens
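For reference, a minimal sketch of the metric named in that docstring: sentence-level BLEU with NLTK's smoothing method 2 and auto reweighting, scaled to 0~100. It illustrates the definition and is not the repository's code.

# Illustrative sketch, not part of the diff.
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu


def bleu_sketch(gold: list[str], pred: list[str]) -> float:
    smoother = SmoothingFunction().method2
    # NLTK expects a list of reference token lists plus one hypothesis token list.
    score = sentence_bleu(
        [gold], pred, smoothing_function=smoother, auto_reweigh=True
    )
    return 100.0 * score


print(bleu_sketch('the cat sat'.split(), 'the cat sat'.split()))  # ~100.0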
@@ -30,8 +29,7 @@ def bleu(gold: list[str], pred: list[str]) -> float:
 
 
 def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
-    """
-    Calculate BLEU score for a batch of sentences.
+    """Calculate BLEU score for a batch of sentences.
 
     :param golds: list of gold sentences
     :param preds: list of predicted sentences
@@ -43,8 +41,7 @@ def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
 
 
 def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
-    """
-    Calculate corpus-level BLEU score for a batch of sentences.
+    """Calculate corpus-level BLEU score for a batch of sentences.
 
     :param golds: list of gold sentences
     :param preds: list of predicted sentences
@@ -63,8 +60,7 @@ def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
 def edit_sim(
     gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
 ) -> float:
-    """
-    Calculate char-level edit similarity, in the range of 0~100.
+    """Calculate char-level edit similarity, in the range of 0~100.
 
     :param gold: gold sentence or list of gold tokens
     :param pred: predicted sentence or list of predicted tokens
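A small sketch of one standard reading of that docstring: char-level Levenshtein distance normalized by the longer string and scaled to 0~100. The exact normalization used in the repository is an assumption.

# Illustrative sketch, not the repository's implementation.
from typing import Union


def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance over characters.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = curr
    return prev[-1]


def edit_sim_sketch(gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' ') -> float:
    # Token lists are joined into plain strings before comparing characters.
    g = sep.join(gold) if isinstance(gold, list) else gold
    p = sep.join(pred) if isinstance(pred, list) else pred
    longest = max(len(g), len(p)) or 1
    return 100.0 * (1.0 - levenshtein(g, p) / longest)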
@@ -85,8 +81,7 @@ def batch_edit_sim(
     preds: list[Union[str, list[str]]],
     sep: str = ' ',
 ) -> list[float]:
-    """
-    Calculate char-level edit similarity for a batch of sentences.
+    """Calculate char-level edit similarity for a batch of sentences.
 
     :param golds: list of gold sentences
     :param preds: list of predicted sentences
@@ -102,8 +97,7 @@ T = TypeVar('T')
 
 
 def exact_match(gold: T, pred: T) -> float:
-    """
-    Calculate exact match accuracy, in the range of {0, 100}.
+    """Calculate exact match accuracy, in the range of {0, 100}.
 
     :param gold: gold sentence or list of gold tokens
     :param pred: predicted sentence or list of predicted tokens
@@ -115,8 +109,7 @@ def exact_match(gold: T, pred: T) -> float:
 
 
 def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
-    """
-    Calculate exact match accuracy for a batch of sentences.
+    """Calculate exact match accuracy for a batch of sentences.
 
     :param golds: list of gold sentences
     :param preds: list of predicted sentences
@@ -130,8 +123,7 @@ def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
 def rouge_l(
     gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
 ) -> dict[str, float]:
-    """
-    Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
+    """Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
 
     :param gold: gold sentence or list of gold tokens
     :param pred: predicted sentence or list of predicted tokens
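For context, a sketch of the textbook LCS-based ROUGE-L in the 0~100 range; the repository may instead call a library such as rouge-score, and the result keys here are illustrative only.

# Illustrative sketch of LCS-based ROUGE-L; 'p', 'r', 'f' key names are assumptions.
def rouge_l_sketch(gold: list[str], pred: list[str]) -> dict[str, float]:
    # Longest common subsequence length via dynamic programming.
    dp = [[0] * (len(pred) + 1) for _ in range(len(gold) + 1)]
    for i, g in enumerate(gold, start=1):
        for j, p in enumerate(pred, start=1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if g == p else max(dp[i - 1][j], dp[i][j - 1])
    lcs = dp[-1][-1]

    precision = lcs / len(pred) if pred else 0.0
    recall = lcs / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {'p': 100 * precision, 'r': 100 * recall, 'f': 100 * f1}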
@@ -156,8 +148,7 @@ def batch_rouge_l(
     preds: list[Union[str, list[str]]],
     sep: str = ' ',
 ) -> dict[str, list[float]]:
-    """
-    Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
+    """Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
 
     :param golds: list of gold sentences
     :param preds: list of predicted sentences
@@ -175,8 +166,7 @@ def accuracy(
     pred: list[str],
     ignore: Optional[Sequence[str]] = None,
 ) -> float:
-    """
-    Calculate token-level accuracy, in the range of 0~100.
+    """Calculate token-level accuracy, in the range of 0~100.
     If gold and pred are not the same length, the longer one would be truncated.
 
     :param gold: list of gold tokens
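A brief sketch of token-level accuracy as that docstring defines it, truncating to the shorter sequence; how the `ignore` tokens are handled is an assumption.

# Illustrative sketch, not part of the diff.
from typing import Optional, Sequence


def accuracy_sketch(gold: list[str], pred: list[str], ignore: Optional[Sequence[str]] = None) -> float:
    ignore = set(ignore or ())
    # zip() truncates to the shorter list; positions with ignored gold tokens are skipped.
    pairs = [(g, p) for g, p in zip(gold, pred) if g not in ignore]
    if not pairs:
        return 0.0
    return 100.0 * sum(g == p for g, p in pairs) / len(pairs)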
@@ -210,8 +200,7 @@ def batch_accuracy(
     preds: list[list[str]],
     ignore: Optional[Sequence[str]] = None,
 ) -> list[float]:
-    """
-    Calculate token-level accuracy for a batch of sentences.
+    """Calculate token-level accuracy for a batch of sentences.
 
     :param golds: list of gold sentences
     :param preds: list of predicted sentences
@@ -226,8 +215,7 @@ def batch_accuracy(
 def first_match_to_topk(
     first_match_list: list[int], k_values: list[int]
 ) -> dict[int, list[float]]:
-    """
-    Calculate top-k accuracy with the first match ranks (1-indexed).
+    """Calculate top-k accuracy with the first match ranks (1-indexed).
 
     :param first_match: first match ranks (1-indexed)
     :param k_values: k values to consider
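A sketch of turning 1-indexed first-match ranks into top-k accuracy; returning per-example 0/100 hits for each k is inferred from the dict[int, list[float]] signature, not confirmed by the source.

# Illustrative sketch: an example is a top-k hit when its first-match rank is <= k.
def first_match_to_topk_sketch(first_match_list: list[int], k_values: list[int]) -> dict[int, list[float]]:
    return {
        k: [100.0 if rank <= k else 0.0 for rank in first_match_list]
        for k in k_values
    }


print(first_match_to_topk_sketch([1, 3, 7], [1, 5]))  # {1: [100.0, 0.0, 0.0], 5: [100.0, 100.0, 0.0]}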
@@ -237,8 +225,7 @@ def first_match_to_topk(
 
 
 def pass_at_k(n: int, c: int, k: int) -> float:
-    """
-    Sample pass@k metric according to the Codex paper, but in the scale of 0~100.
+    """Sample pass@k metric according to the Codex paper, but in the scale of 0~100.
     :param n: total number of samples
     :param c: number of correct samples
     :param k: k in pass@$k$
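For reference, the unbiased pass@k estimator from the Codex paper (Chen et al., 2021) that the docstring cites, scaled to 0~100; this sketch is independent of the repository's implementation.

# pass@k = 1 - C(n - c, k) / C(n, k), computed in the numerically stable product form.
import numpy as np


def pass_at_k_sketch(n: int, c: int, k: int) -> float:
    if n - c < k:
        return 100.0  # every size-k draw contains at least one correct sample
    return float(100.0 * (1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))))


print(pass_at_k_sketch(10, 3, 1))  # 30.0: with 3/10 samples correct, pass@1 is 30%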
@@ -251,8 +238,7 @@ def pass_at_k(n: int, c: int, k: int) -> float:
 
 
 def self_bleu(samples: list[list[str]]) -> float:
-    """
-    Calculate self-BLEU among the samples.
+    """Calculate self-BLEU among the samples.
     :param samples: the chosen m samples
     :return: self-BLEU
     """
@@ -274,8 +260,7 @@ def self_bleu(samples: list[list[str]]) -> float:
 
 
 def self_edit_distance(samples: list[Union[str, list[str]]], sep=' ') -> float:
-    """
-    Calculate self-edit-distance among the samples.
+    """Calculate self-edit-distance among the samples.
     :param samples: the chosen m samples
     :param sep: the separator between tokens
     :return: self-edit-distance
@@ -30,8 +30,7 @@ def check_mutation(mutation_output):
 
 
 def count_methods(code_str):
-    """
-    Counts the number of methods/functions in a given string of code.
+    """Counts the number of methods/functions in a given string of code.
 
     Args:
         code_str (str): A string containing code.
@@ -46,8 +45,7 @@ def count_methods(code_str):
 
 
 def get_lines_of_code(code_str):
-    """
-    Extracts lines of code from a given string.
+    """Extracts lines of code from a given string.
 
     Args:
         code_str (str): A string containing code.
@@ -7,8 +7,7 @@ import traceback
 
 
 def insert_line_in_string(input_string, new_str, insert_line):
-    """
-    Inserts a new line into a string at the specified line number.
+    """Inserts a new line into a string at the specified line number.
 
     :param input_string: The original string.
     :param new_str: The string to insert.
@@ -29,8 +28,7 @@ def insert_line_in_string(input_string, new_str, insert_line):
 
 
 def print_string_diff(original, modified):
-    """
-    Prints the differences between two strings line by line.
+    """Prints the differences between two strings line by line.
 
     :param original: The original string.
     :param modified: The modified string.
@@ -37,8 +37,7 @@ def extract_preamble_classes_and_functions(code):
     current_position = 0
 
     def extract_class_body(code: str, start_index: int) -> tuple[str, int]:
-        """
-        Extracts the body of a class from the given code starting from the specified index.
+        """Extracts the body of a class from the given code starting from the specified index.
         Returns the class body and the end index of the class body.
         """
         if not code or start_index < 0 or start_index >= len(code):
@@ -168,8 +167,8 @@ def extract_preamble_classes_and_functions(code):
 def filter_passing_tests(
     test_content: str, test_output: str, repo: str
 ) -> tuple[str, list[str], list[str]]:
-    """
-    Filter tests based on their execution results.
+    """Filter tests based on their execution results.
+
     Returns:
         Tuple containing:
         - Modified test content with only passing tests
@@ -246,8 +245,7 @@ def filter_passing_tests(
 def filter_tests(
     test_content: str, test_output: str, repo: str
 ) -> tuple[str, list[str], list[str]]:
-    """
-    Filter tests using AST parsing to remove failing test functions from the test file.
+    """Filter tests using AST parsing to remove failing test functions from the test file.
     Non-test functions (e.g. setup or helper methods) and classes (even if all test methods are failing)
     are preserved.
 
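A hedged sketch of the AST-based filtering that docstring describes: failing test functions are dropped while helper functions and class definitions are kept. The function name and the exact pruning rules here are assumptions, not the repository's code.

# Illustrative sketch, not part of the diff.
import ast


def drop_failing_tests_sketch(test_content: str, failing: set[str]) -> str:
    tree = ast.parse(test_content)

    def keep(node: ast.stmt) -> bool:
        is_func = isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        # Only test functions known to fail are removed; everything else stays.
        return not (is_func and node.name.startswith('test') and node.name in failing)

    tree.body = [n for n in tree.body if keep(n)]
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            # Classes are preserved; only their failing test methods are pruned.
            node.body = [n for n in node.body if keep(n)] or [ast.Pass()]
    return ast.unparse(tree)  # ast.unparse requires Python 3.9+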
@@ -20,9 +20,7 @@ DIFF_MODIFIED_FILE_REGEX = r'--- a/(.*)'
 
 @dataclass
 class TestSpec:
-    """
-    A dataclass that represents a test specification for a single instance of SWE-bench.
-    """
+    """A dataclass that represents a test specification for a single instance of SWE-bench."""
 
     instance_id: str
     id: str
@@ -86,10 +84,7 @@ def make_test_setup(specs, env_name, repo_directory, includes_tox=False):
 
 
 def make_test_script_list(test_cmd, specs, env_name, repo_directory):
-    """
-    Runs the tests.
-    """
-
+    """Runs the tests."""
     includes_tox = 'tox' in test_cmd
     eval_commands = make_test_setup(specs, env_name, repo_directory, includes_tox)
     eval_commands += [
@@ -104,10 +99,7 @@ def make_test_script_list(test_cmd, specs, env_name, repo_directory):
 
 
 def make_mutation_script_list(specs, env_name, repo_directory, mutation_timeout):
-    """
-    Runs the tests.
-    """
-
+    """Runs the tests."""
    eval_commands = make_test_setup(specs, env_name, repo_directory)
     eval_commands += [
         'cosmic-ray init mutation.toml mutation.sqlite',
@@ -11,8 +11,7 @@ from evaluation.benchmarks.testgeneval.constants import (
 
 
 def get_test_directives(instance: TestGenEvalInstance) -> list:
-    """
-    Get test directives from the test_patch of a task instance
+    """Get test directives from the test_patch of a task instance
 
     Args:
         instance (dict): task instance
@@ -43,9 +42,7 @@ def get_test_directives(instance: TestGenEvalInstance) -> list:
 def load_testgeneval_dataset(
     name='kjain14/testgeneval', split='test', ids=None
 ) -> list[TestGenEvalInstance]:
-    """
-    Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file
-    """
+    """Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file"""
     # check that all instance IDs are in the dataset
     if ids:
         ids = set(ids)