Add TestGenEval benchmark (#5534)
Co-authored-by: Kush Dave Jain <kdjain@pit.isri.cmu.edu>
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Graham Neubig <neubig@gmail.com>
evaluation/benchmarks/testgeneval/NOTES.md (new file, 12 lines)
@@ -0,0 +1,12 @@
codamosa_ids = ['pydata__xarray-4750-16496', 'pydata__xarray-3239-16458', 'pydata__xarray-4966-16515', 'pydata__xarray-3302-16459', 'pydata__xarray-5126-16518', 'pydata__xarray-4994-16516', 'pydata__xarray-3905-16478', 'pydata__xarray-4182-16484', 'pydata__xarray-5131-16520', 'pydata__xarray-5662-16532', 'pydata__xarray-3364-16461', 'pydata__xarray-5731-16534', 'pydata__xarray-3239-16457', 'pydata__xarray-7203-16577', 'pydata__xarray-3156-16454', 'pydata__xarray-5126-16519', 'pydata__xarray-5365-16529', 'pydata__xarray-4629-16492', 'pydata__xarray-4248-16486', 'pydata__xarray-4339-16487', 'pydata__xarray-3151-16453', 'pydata__xarray-3114-16452', 'pydata__xarray-5033-16517', 'pydata__xarray-4802-16505', 'pydata__xarray-5455-16530', 'pydata__xarray-6400-16539', 'pydata__xarray-3239-16456', 'pydata__xarray-4419-16488']
pynguin_ids = ['pydata__xarray-6548-16541', 'pydata__xarray-7003-16557', 'pydata__xarray-3114-16452', 'pydata__xarray-4339-16487', 'pydata__xarray-6889-16549', 'pydata__xarray-3239-16458', 'pydata__xarray-3364-16461', 'pydata__xarray-3239-16457', 'pydata__xarray-5365-16529', 'pydata__xarray-5131-16520', 'pydata__xarray-7229-16578', 'pydata__xarray-6461-16540', 'pydata__xarray-4419-16488', 'pydata__xarray-7147-16571', 'pydata__xarray-3151-16453', 'pydata__xarray-4966-16515', 'pydata__xarray-4629-16492', 'pydata__xarray-3239-16456', 'pydata__xarray-7400-16582', 'pydata__xarray-4994-16516', 'pydata__xarray-3302-16459', 'pydata__xarray-6601-16544', 'pydata__xarray-6882-16548', 'pydata__xarray-6135-16535', 'pydata__xarray-7393-16581', 'pydata__xarray-5731-16534', 'pydata__xarray-7203-16577']
ids = ['pydata__xarray-3114-16452', 'pydata__xarray-3151-16453', 'pydata__xarray-3156-16454', 'pydata__xarray-3239-16456', 'pydata__xarray-3239-16457', 'pydata__xarray-3239-16458', 'pydata__xarray-3302-16459', 'pydata__xarray-3364-16461', 'pydata__xarray-3677-16471', 'pydata__xarray-3905-16478', 'pydata__xarray-4182-16484', 'pydata__xarray-4248-16486', 'pydata__xarray-4339-16487', 'pydata__xarray-4419-16488', 'pydata__xarray-4629-16492', 'pydata__xarray-4750-16496', 'pydata__xarray-4802-16505', 'pydata__xarray-4966-16515', 'pydata__xarray-4994-16516', 'pydata__xarray-5033-16517', 'pydata__xarray-5126-16518', 'pydata__xarray-5126-16519', 'pydata__xarray-5131-16520', 'pydata__xarray-5365-16529', 'pydata__xarray-5455-16530', 'pydata__xarray-5662-16532', 'pydata__xarray-5731-16534', 'pydata__xarray-6135-16535', 'pydata__xarray-6135-16536', 'pydata__xarray-6386-16537', 'pydata__xarray-6394-16538', 'pydata__xarray-6400-16539', 'pydata__xarray-6461-16540', 'pydata__xarray-6548-16541', 'pydata__xarray-6599-16543', 'pydata__xarray-6601-16544', 'pydata__xarray-6882-16548', 'pydata__xarray-6889-16549', 'pydata__xarray-7003-16557', 'pydata__xarray-7147-16571', 'pydata__xarray-7150-16572', 'pydata__xarray-7203-16577', 'pydata__xarray-7229-16578', 'pydata__xarray-7393-16581', 'pydata__xarray-7400-16582']
Command eval (our approach):
poetry run ./evaluation/benchmarks/testgeneval/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/kjain14__testgeneval-test/CodeActAgent/gpt-4o_maxiter_25_N_v0.20.0-no-hint-run_1/output.jsonl 10 kjain14/testgeneval test true
Command run (our approach):
./evaluation/benchmarks/testgeneval/scripts/run_infer.sh llm.eval_gpt HEAD CodeActAgent -1 25 10 kjain14/testgeneval test 1 ../TestGenEval/results/testgeneval/preds/gpt-4o-2024-08-06__testgeneval__0.2__test.jsonl
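A minimal sketch of how the ID lists above can be used to subset a predictions file by `instance_id` (the file name is illustrative):

```python
# Illustrative only: keep just the instances covered by the CodaMosa subset.
import pandas as pd

predictions = pd.read_json('output.jsonl', lines=True)  # any run_infer output
subset = predictions[predictions['instance_id'].isin(codamosa_ids)]
subset.to_json('output.codamosa_subset.jsonl', orient='records', lines=True)
```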
evaluation/benchmarks/testgeneval/README.md (new file, 80 lines)
@@ -0,0 +1,80 @@
# TestGenEval Benchmark Evaluation

This folder contains the evaluation harness for the TestGenEval benchmark ([paper](https://arxiv.org/abs/2410.00752)). TestGenEval is designed to evaluate the ability of language models to generate unit tests for Python code.

## Setup Environment and LLM Configuration

1. Follow the instructions [here](../../README.md#setup) to set up your local development environment and configure your LLM.

2. Install the TestGenEval dependencies:

```bash
poetry install --with testgeneval
```

## Run Inference

To generate tests using your model, run the following command:

```bash
./evaluation/benchmarks/testgeneval/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]

# Example
./evaluation/benchmarks/testgeneval/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 100 30 1 kjain14/testgenevallite test
```

Parameters:
- `model_config`: The config group name for your LLM settings (e.g., `llm.eval_gpt4_1106_preview`)
- `git-version`: The git commit hash or release tag of OpenHands to evaluate (e.g., `HEAD` or `0.6.2`)
- `agent`: The name of the agent for benchmarks (default: `CodeActAgent`)
- `eval_limit`: Limit the evaluation to the first N instances (optional)
- `max_iter`: Maximum number of iterations for the agent to run (default: 30)
- `num_workers`: Number of parallel workers for evaluation (default: 1)
- `dataset`: HuggingFace dataset name (default: `kjain14/testgenevallite`)
- `dataset_split`: Dataset split to use (default: `test`)

After running inference, you will obtain an `output.jsonl` file (by default saved to `evaluation/evaluation_outputs`).
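As a quick sanity check before evaluation, you can count how many instances actually produced a test suite. A minimal sketch (the path is illustrative; it assumes the generated suite is stored under `test_result.test_suite`, which is the layout `eval_infer.py` reads):

```python
# Illustrative only: inspect an inference output file before running evaluation.
import pandas as pd

# Adjust to the output.jsonl produced by your run.
output_path = 'evaluation/evaluation_outputs/outputs/kjain14__testgenevallite-test/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl'
predictions = pd.read_json(output_path, lines=True)

non_empty = predictions['test_result'].apply(lambda r: bool(r.get('test_suite')))
print(f'{non_empty.sum()} / {len(predictions)} instances produced a non-empty test suite')
```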
## Evaluate Generated Tests

To evaluate the generated tests, use the `eval_infer.sh` script:

```bash
./evaluation/benchmarks/testgeneval/scripts/eval_infer.sh $YOUR_OUTPUT_JSONL [instance_id] [dataset_name] [split] [num_workers] [skip_mutation]

# Example
./evaluation/benchmarks/testgeneval/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/kjain14__testgenevallite-test/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
```

Optional arguments:
- `instance_id`: Evaluate a single instance (optional)
- `dataset_name`: Name of the dataset to use (default: `kjain14/testgenevallite`)
- `split`: Dataset split to use (default: `test`)
- `num_workers`: Number of parallel workers for running Docker-based evaluation (default: 1)
- `skip_mutation`: Skip mutation testing (pass `true` to skip)

The evaluation results will be saved to `evaluation/evaluation_outputs/outputs/kjain14__testgenevallite-test/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/`, with `output.testgeneval.jsonl` containing the metrics.

## Metrics

The TestGenEval benchmark evaluates generated tests on the following metrics:

1. Correctness: Measures whether the generated tests are syntactically valid and run without errors against the code under test.
2. Coverage: Assesses the code coverage achieved by the generated tests.
3. Mutation Score: Evaluates the effectiveness of the tests in detecting intentionally introduced bugs (mutations).
4. Readability: Estimates the readability of the generated tests from lexical and structural code features.
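Per-instance results can then be aggregated from `output.testgeneval.jsonl`. A minimal sketch (the field names follow the report written by `eval_infer.py`; the path is illustrative):

```python
# Illustrative only: average the headline metrics over all evaluated instances.
import pandas as pd

results = pd.read_json('output.testgeneval.jsonl', lines=True)
report = pd.DataFrame([r['report'] for r in results['test_result']])

for field in ['coverage', 'mutation_score', 'tests_pass', 'all_tests_pass']:
    print(f'{field}: {report[field].mean():.2f}')
```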
## Submit Your Evaluation Results

To contribute your evaluation results:

1. Fork [our HuggingFace evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation).
2. Add your results to the forked repository.
3. Submit a Pull Request with your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

## Additional Resources

- [TestGenEval Paper](https://arxiv.org/abs/2410.00752)
- [OpenHands Documentation](https://github.com/All-Hands-AI/OpenHands)
- [HuggingFace Datasets](https://huggingface.co/datasets)

For any questions or issues, please open an issue in the [OpenHands repository](https://github.com/All-Hands-AI/OpenHands/issues).
evaluation/benchmarks/testgeneval/__init__.py (new file, empty)
evaluation/benchmarks/testgeneval/compute_readability.py (new file, 351 lines)
@@ -0,0 +1,351 @@
import math

from tree_sitter import Language, Parser


def total_byte_entropy_stats(python_code):
    # Count the occurrence of each byte (character for simplicity)
    byte_counts = {}
    for byte in python_code.encode('utf-8'):
        byte_counts[byte] = byte_counts.get(byte, 0) + 1

    total_bytes = sum(byte_counts.values())
    entropy = -sum(
        (count / total_bytes) * math.log2(count / total_bytes)
        for count in byte_counts.values()
    )

    return {'total_byte_entropy': entropy}


def average_nulls_stats(tree, num_lines):
    total_nulls = 0
    nulls_per_line = {}  # Dictionary to count nulls per line

    def traverse(node):
        nonlocal total_nulls
        if node.type == 'null_literal':
            total_nulls += 1
            line_number = node.start_point[0]  # Get line number
            if line_number in nulls_per_line:
                nulls_per_line[line_number] += 1
            else:
                nulls_per_line[line_number] = 1
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)

    # Calculate average nulls per line
    avg_nulls = total_nulls / num_lines if num_lines > 0 else 0

    # Calculate max nulls on any line
    max_nulls_on_any_line = max(nulls_per_line.values()) if nulls_per_line else 0

    return {
        'avg_nulls': avg_nulls,
        'total_nulls': total_nulls,
        'max_nulls': max_nulls_on_any_line,
        'has_nulls': 1 if total_nulls > 0 else 0,
    }


def arithmetic_operations_stats(tree, num_lines):
    # Dictionary to hold counts of each arithmetic operation
    op_counts = {'+': 0, '-': 0, '*': 0, '/': 0, '%': 0}
    total_ops = 0

    # Function to traverse the AST and update operation counts
    def traverse(node):
        nonlocal total_ops
        if node.type == 'binary_expression' or node.type == 'update_expression':
            for child in node.children:
                if child.type == 'operator':
                    op = child.text.decode('utf8')
                    if op in op_counts:
                        op_counts[op] += 1
                        total_ops += 1
        else:
            for child in node.children:
                traverse(child)

    traverse(tree.root_node)

    return {
        'total_arithmetic_operations': total_ops,
        'avg_arithmetic_operations': total_ops / num_lines,
    }


def numbers_floats_stats(tree, num_lines):
    total_numbers = 0
    total_floats = 0

    def traverse(node):
        nonlocal total_numbers, total_floats
        if node.type in ['integer_literal', 'decimal_literal']:
            total_numbers += 1
            if (
                '.' in node.text.decode('utf8')
                or 'e' in node.text.decode('utf8').lower()
            ):
                total_floats += 1
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)
    return {'total_numbers': total_numbers, 'total_floats': total_floats}


def code_stats(python_code):
    lines = python_code.strip().split('\n')
    total_line_length = sum(len(line) for line in lines)
    max_line_length = max(len(line) for line in lines)
    return {
        'total_line_length': total_line_length,
        'max_line_length': max_line_length,
        'avg_characters': total_line_length / len(lines),
    }


def assertions_stats(tree, num_lines):
    total_assertions = 0

    def traverse(node):
        nonlocal total_assertions
        if node.type == 'assert_statement':
            total_assertions += 1
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)
    return {
        'total_assertions': total_assertions,
        'total_has_assertions': 1 if total_assertions > 0 else 0,
    }


def class_instances_stats(tree, num_lines):
    total_class_instances = 0

    def traverse(node):
        nonlocal total_class_instances
        if node.type == 'object_creation_expression':
            total_class_instances += 1
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)
    return {'total_class_instances': total_class_instances}


def has_exceptions(tree, num_lines):
    total_has_exceptions = 0

    def traverse(node):
        nonlocal total_has_exceptions
        if node.type == 'try_statement':
            total_has_exceptions += 1
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)
    return {'total_has_exceptions': 1 if total_has_exceptions > 0 else 0}


def distinct_methods_stats(tree, num_lines):
    method_names = set()
    total_nodes = 0

    def traverse(node):
        nonlocal total_nodes
        if node.type == 'method_declaration':
            for child in node.children:
                if child.type == 'identifier':
                    method_names.add(child.text.decode('utf8'))
                    break
        total_nodes += 1
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)
    total_distinct_methods = len(method_names)
    total_method_ratio = (
        total_distinct_methods / (total_nodes - total_distinct_methods)
        if total_nodes > total_distinct_methods
        else 0
    )

    return {
        'total_distinct_methods': total_distinct_methods,
        'total_method_ratio': total_method_ratio,
    }


def loops_stats(tree, num_lines):
    """
    Calculate the average number of loops.
    """
    total_loops = 0

    def traverse(node):
        nonlocal total_loops
        if node.type in ['for_statement', 'while_statement', 'do_statement']:
            total_loops += 1
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)
    avg_loops = total_loops / num_lines
    return {'avg_loops': avg_loops}


def branches_stats(tree, num_lines):
    """
    Calculate the average number of branches (conditional statements).
    """
    total_branches = 0

    def traverse(node):
        nonlocal total_branches
        if node.type in ['if_statement', 'switch_statement']:
            total_branches += 1
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)
    # Assuming each branch is its own, this might need refinement based on definition
    avg_branches = total_branches / num_lines
    return {'avg_branches': avg_branches}


def string_stats(tree, num_lines):
    string_literals = []

    # Function to traverse the AST and collect string literals
    def traverse(node):
        if node.type == 'string_literal':
            # Extracting the string literal, excluding the quotation marks
            literal_text = node.text.decode('utf8')[1:-1]
            string_literals.append(literal_text)
        for child in node.children:
            traverse(child)

    traverse(tree.root_node)

    # Calculate the average string length
    total_length = sum(len(s) for s in string_literals)
    avg_length = total_length / num_lines
    return {'avg_str_length': avg_length}


def identifier_stats(tree, num_lines):
    root_node = tree.root_node
    identifier_counts = {}  # Dictionary to count occurrences of each identifier
    total_nodes = 0  # Counter for all nodes

    # Function to recursively count identifiers and all nodes, gathering their stats
    def count(node):
        nonlocal identifier_counts, total_nodes
        iden_count = 0
        max_length = 0
        total_nodes += 1  # Increment total nodes for every node visited
        if node.type == 'identifier':
            identifier = node.text.decode('utf8')  # Assuming UTF-8 encoding
            iden_count += 1
            identifier_counts[identifier] = identifier_counts.get(identifier, 0) + 1
            iden_length = len(identifier)
            if iden_length > max_length:
                max_length = iden_length
        for child in node.children:
            child_count, child_max_length = count(child)
            iden_count += child_count
            if child_max_length > max_length:
                max_length = child_max_length
        return iden_count, max_length

    total_identifiers, max_identifier_length = count(root_node)
    total_unique_identifiers = len(identifier_counts)
    total_identifier_length = sum(len(k) * v for k, v in identifier_counts.items())
    avg_identifier_length = total_identifier_length / num_lines

    # Calculate the identifier ratio as total identifiers over total nodes
    identifier_ratio = total_identifiers / total_nodes if total_nodes > 0 else 0

    return {
        'total_identifiers': total_identifiers,
        'total_identifier_length': total_identifier_length,
        'max_identifier_length': max_identifier_length,
        'avg_identifier_length': avg_identifier_length,
        'total_unique_identifiers': total_unique_identifiers,
        'total_identifier_ratio': identifier_ratio,  # Include the new ratio in the returned dictionary
        'total_nodes': total_nodes,  # Include total node count for reference or further calculations
    }


def compute_regression(results):
    components = {
        'total_line_length': -0.0001,
        'max_line_length': -0.0021,
        'total_identifiers': 0.0076,
        'total_identifier_length': -0.0004,
        'max_identifier_length': -0.0067,
        'avg_identifier_length': -0.005,
        'avg_arithmetic_operations': 0.0225,
        'avg_branches': 0.9886,
        'avg_loops': 0.1572,
        'total_assertions': 0.0119,
        'total_has_assertions': -0.0147,
        'avg_characters': 0.1242,
        'total_class_instances': -0.043,
        'total_distinct_methods': -0.0127,
        'avg_str_length': 0.0026,
        'total_has_exceptions': 0.1206,
        'total_unique_identifiers': -0.019,
        'max_nulls': -0.0712,
        'total_numbers': -0.0078,
        'avg_nulls': 0.1444,
        'total_identifier_ratio': 0.334,
        'total_method_ratio': 0.0406,
        'total_floats': -0.0174,
        'total_byte_entropy': -0.3917,
    }
    test_score = 0

    for component in components:
        test_score += components[component] * results[component]

    test_score += 5.7501
    return test_score


def compute_readability(python_code):
    # Create parser and set up language
    import tree_sitter_python

    parser = Parser(Language(tree_sitter_python.language()))

    results = code_stats(python_code)

    num_lines = len(python_code.strip().split('\n'))
    results.update(total_byte_entropy_stats(python_code))

    tree = parser.parse(bytes(python_code, 'utf8'))

    results.update(identifier_stats(tree, num_lines))
    results.update(loops_stats(tree, num_lines))
    results.update(branches_stats(tree, num_lines))
    results.update(distinct_methods_stats(tree, num_lines))
    results.update(has_exceptions(tree, num_lines))
    results.update(class_instances_stats(tree, num_lines))
    results.update(assertions_stats(tree, num_lines))
    results.update(numbers_floats_stats(tree, num_lines))
    results.update(average_nulls_stats(tree, num_lines))
    results.update(arithmetic_operations_stats(tree, num_lines))
    results.update(string_stats(tree, num_lines))

    score = compute_regression(results)
    return score
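For reference, a minimal usage sketch of `compute_readability` (the sample test is illustrative; it assumes `tree_sitter_python` is installed, as the function requires):

```python
# Illustrative only: score a small generated test suite for readability.
from evaluation.benchmarks.testgeneval.compute_readability import compute_readability

sample_test = '''
import math


def test_sqrt():
    assert math.sqrt(4) == 2
'''

print(f'readability score: {compute_readability(sample_test):.2f}')
```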
evaluation/benchmarks/testgeneval/constants.py (new file, 1578 lines; diff suppressed because it is too large)
evaluation/benchmarks/testgeneval/eval_infer.py (new file, 629 lines)
@@ -0,0 +1,629 @@
import os
import tempfile
import time
from functools import partial

import pandas as pd
from report_utils import (
    check_coverage,
    check_mutation,
    count_methods,
    get_lines_of_code,
)

from evaluation.benchmarks.testgeneval.compute_readability import compute_readability
from evaluation.benchmarks.testgeneval.constants import (
    COVERAGE_PREFIX,
    MUTATION_BUFFER,
    MUTATION_TEMPLATE,
    MUTATION_TIMEOUT,
    TESTS_SUFFIX,
)
from evaluation.benchmarks.testgeneval.metrics import (
    bleu,
    edit_sim,
    exact_match,
    rouge_l,
)
from evaluation.benchmarks.testgeneval.pygments_utils import tokenize_code
from evaluation.benchmarks.testgeneval.run_infer import get_instance_docker_image
from evaluation.benchmarks.testgeneval.test_filter import filter_tests
from evaluation.benchmarks.testgeneval.test_spec import (
    TestGenEvalInstance,
    TestSpec,
    make_test_spec,
)
from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
)
from openhands.core.config import AppConfig, SandboxConfig, get_parser
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime
from openhands.events.action import CmdRunAction
from openhands.events.observation import CmdOutputObservation
from openhands.utils.async_utils import call_async_from_sync

DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/kdjain/')
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')


def get_config(instance: pd.Series) -> AppConfig:
    base_container_image = get_instance_docker_image(instance['instance_id_swebench'])
    assert (
        base_container_image
    ), f"Invalid container image for instance {instance['instance_id_swebench']}."
    logger.info(f'Using instance container image: {base_container_image}.')
    return AppConfig(
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=SandboxConfig(
            base_container_image=base_container_image,
            use_host_network=False,
            timeout=1800,
            api_key=os.environ.get('ALLHANDS_API_KEY'),
            remote_runtime_api_url=os.environ.get(
                'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
            ),
        ),
        workspace_base=None,
        workspace_mount_path=None,
    )


def compute_lexical_metrics(pred_suite, gold_suite):
    pred_loc = get_lines_of_code(pred_suite)
    gold_loc = get_lines_of_code(gold_suite)
    pred_methods = count_methods(pred_suite)
    gold_methods = count_methods(gold_suite)
    readability_pred = compute_readability(pred_suite)
    readability_gold = compute_readability(gold_suite)

    preds = tokenize_code(pred_suite)
    golds = tokenize_code(gold_suite)

    return {
        'pred_loc': pred_loc,
        'gold_loc': gold_loc,
        'pred_readability': readability_pred,
        'gold_readability': readability_gold,
        'pred_methods': pred_methods,
        'gold_methods': gold_methods,
        'bleu': bleu(preds, golds),
        'xmatch': exact_match(preds, golds),
        'edit_sim': edit_sim(preds, golds),
        'rouge_f': rouge_l(golds, preds)['f'],
        'rouge_p': rouge_l(golds, preds)['p'],
        'rouge_r': rouge_l(golds, preds)['r'],
    }


def run_command(runtime, command, timeout=600):
    action = CmdRunAction(command=command)
    action.set_hard_timeout(timeout)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
    return obs


def run_tests(runtime, instance, test_script, log_file='/tmp/test_output.log'):
    action = CmdRunAction(command=f'bash {test_script} > {log_file} 2>&1 & echo $!')
    action.set_hard_timeout(60)
    obs = runtime.run_action(action)

    assert isinstance(obs, CmdOutputObservation), 'Failed to start test script.'
    pid = obs.content.split()[-1].strip()
    logger.info(f'[{instance.instance_id}] Test process started with PID: {pid}')

    start_time = time.time()
    timeout = 1800
    while True:
        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            logger.info(f'[{instance.instance_id}] Test process timed out.')
            instance['test_result']['report']['test_timeout'] = True
            break

        check_action = CmdRunAction(command=f'ps -p {pid} > /dev/null; echo $?')
        check_obs = runtime.run_action(check_action)
        if (
            isinstance(check_obs, CmdOutputObservation)
            and len(check_obs.content.split()) > 0
            and check_obs.content.split()[-1].strip() == '1'
        ):
            logger.info(f'[{instance.instance_id}] Test process completed.')
            break
        time.sleep(30)

    test_action = CmdRunAction(command=f'cat {log_file}')
    test_action.set_hard_timeout(300)
    test_obs = runtime.run_action(test_action)
    assert isinstance(test_obs, CmdOutputObservation), 'Failed to retrieve test output.'
    return test_obs.exit_code, test_obs.content, elapsed_time


def run_mutation_testing(
    runtime, instance, mutation_script, log_file='/tmp/mutation_output.log'
):
    action = CmdRunAction(command=f'bash {mutation_script} > {log_file} 2>&1 & echo $!')
    action.set_hard_timeout(60)
    obs = runtime.run_action(action)

    assert isinstance(obs, CmdOutputObservation), 'Failed to start mutation script.'
    pid = obs.content.split()[-1].strip()
    logger.info(f'[{instance.instance_id}] Mutation process started with PID: {pid}')

    start_time = time.time()
    timeout = 4000
    while True:
        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            logger.info(f'[{instance.instance_id}] Mutation process timed out.')
            instance['test_result']['report']['mutation_timeout'] = True
            break

        check_action = CmdRunAction(command=f'ps -p {pid} > /dev/null; echo $?')
        check_obs = runtime.run_action(check_action)
        if (
            isinstance(check_obs, CmdOutputObservation)
            and len(check_obs.content.split()) > 0
            and check_obs.content.split()[-1].strip() == '1'
        ):
            logger.info(f'[{instance.instance_id}] Mutation process completed.')
            break
        time.sleep(30)

    mutation_action = CmdRunAction(command=f'cat {log_file}')
    mutation_action.set_hard_timeout(300)
    mutation_obs = runtime.run_action(mutation_action)
    assert isinstance(
        mutation_obs, CmdOutputObservation
    ), 'Failed to retrieve mutation output.'
    return mutation_obs.exit_code, mutation_obs.content


def grade_test_output(
    test_suite: str, instance: pd.Series, test_output: str, test_spec: TestSpec, runtime
):
    """
    Two-pass test grading with short-circuiting:
    1. Run all tests to identify passing/failing tests
    2. If no failing tests, evaluate coverage immediately
    3. Otherwise, run only passing tests for coverage analysis
    """
    unit_test_output, coverage_output = '', ''
    if TESTS_SUFFIX in test_output:
        unit_test_output = test_output.split(TESTS_SUFFIX)[0]

    if not unit_test_output:
        return (
            False,
            0,
            '',
            '',
            {
                'total_tests': 0,
                'passing_tests': 0,
                'failing_tests': 0,
                'any_pass': False,
                'all_pass': False,
                'passing_test_names': [],
                'failing_test_names': [],
            },
        )

    logger.info('Calling filter unit tests')
    filtered_content, passing_tests, failing_tests = filter_tests(
        test_suite, unit_test_output, test_spec.repo
    )

    total_tests = len(passing_tests) + len(failing_tests)
    test_stats = {
        'total_tests': total_tests,
        'passing_tests': len(passing_tests),
        'failing_tests': len(failing_tests),
        'any_pass': len(passing_tests) > 0,
        'all_pass': len(failing_tests) == 0 and total_tests > 0,
        'passing_test_names': passing_tests,
        'failing_test_names': failing_tests,
    }

    if not passing_tests:
        return False, 0, unit_test_output, coverage_output, test_stats

    # If all tests pass, evaluate coverage immediately
    if not failing_tests:
        coverage = 0
        cov_success = False
        if COVERAGE_PREFIX in test_output:
            coverage_output = test_output.split(COVERAGE_PREFIX)[1]
            _, coverage = check_coverage(coverage_output, test_spec.code_file)
            cov_success = True
        # test_stats['filtered_suite'] = test_suite
        return cov_success, coverage, unit_test_output, coverage_output, test_stats

    cov_success = False
    coverage = 0
    # Second pass - run coverage on passing tests
    if filtered_content:
        with tempfile.TemporaryDirectory() as temp_dir:
            test_suite_path = os.path.join(temp_dir, 'test_suite.py')
            with open(test_suite_path, 'w') as f:
                f.write(filtered_content)
            runtime.copy_to(test_suite_path, '/tmp')

        run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')
        _, test_output_second_pass, _ = run_tests(runtime, instance, '/tmp/test.sh')

        coverage, coverage_output, unit_test_output = 0, '', test_output_second_pass

        if COVERAGE_PREFIX in test_output_second_pass:
            coverage_output = test_output_second_pass.split(COVERAGE_PREFIX)[1]
            unit_test_output = test_output_second_pass.split(TESTS_SUFFIX)[0]
            _, coverage = check_coverage(coverage_output, test_spec.code_file)
            cov_success = True

    # test_stats['filtered_suite'] = filtered_content
    return cov_success, coverage, unit_test_output, coverage_output, test_stats


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    log_dir: str | None = None,
) -> EvalOutput:
    """
    Evaluate agent performance on a TestGenEval problem instance.

    Note that this signature differs from the expected input to `run_evaluation`. Use
    `functools.partial` to provide optional arguments before passing to the evaluation harness.

    Args:
        log_dir (str | None, default=None): Path to directory where log files will be written. Must
        be provided if `reset_logger` is set.

    Raises:
        AssertionError: if the `reset_logger` flag is set without a provided log directory.
    """
    if reset_logger:
        assert (
            log_dir is not None
        ), "Can't reset logger without a provided log directory."
        os.makedirs(log_dir, exist_ok=True)
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    config = get_config(instance)
    id = instance.instance_id
    logger.info(f'Starting evaluation for instance {id}.')

    instance['test_result']['id'] = id
    instance['test_result']['report'] = {
        'test_output': '',
        # 'coverage_output': '',
        # 'mutation_output': '',
        'empty_generation': False,
        'error_eval': False,
        'all_tests_pass': False,
        'tests_pass': False,
        'test_timeout': False,
        'mutation_timeout': False,
        'coverage_success': False,
        'mutation_success': False,
        'coverage': 0,
        'mutation_score': 0,
        'mutation_error_interval': -1,
        'num_mutants': -1,
    }

    instance['test_result']['lexical'] = {
        'pred_loc': -1,
        'gold_loc': -1,
        'pred_readability': -1,
        'gold_readability': -1,
        'pred_methods': -1,
        'gold_methods': -1,
        'bleu': -1,
        'xmatch': -1,
        'edit_sim': -1,
        'rouge_f': -1,
        'rouge_p': -1,
        'rouge_r': -1,
    }

    if instance['test_suite'] == '' or instance['test_suite'] is None:
        instance['test_result']['report']['empty_generation'] = True
        return EvalOutput(
            instance_id=instance.instance_id, test_result=instance['test_result']
        )

    if not args.skip_lexical:
        lexical_metrics = compute_lexical_metrics(
            instance['test_suite'], instance['instance']['test_src']
        )
        instance['test_result']['lexical'] = lexical_metrics

    test_suite = instance['test_suite']
    test_spec: TestSpec = instance['test_spec']
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
    with tempfile.TemporaryDirectory() as temp_dir:
        test_suite_path = os.path.join(temp_dir, 'test_suite.py')
        with open(test_suite_path, 'w') as f:
            f.write(test_suite)
        runtime.copy_to(test_suite_path, '/tmp')

        test_script_path = os.path.join(temp_dir, 'test.sh')
        with open(test_script_path, 'w') as f:
            f.write(test_spec.test_script)
        runtime.copy_to(test_script_path, '/tmp')

        mutation_script_path = os.path.join(temp_dir, 'mutation.sh')
        with open(mutation_script_path, 'w') as f:
            f.write(test_spec.mutation_script)
        runtime.copy_to(mutation_script_path, '/tmp')

    try:
        run_command(runtime, 'chmod +x /tmp/test.sh /tmp/mutation.sh')
        run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')

        # First pass - run all tests
        _, test_output, test_time = run_tests(runtime, instance, '/tmp/test.sh')

        # Grade tests with two-pass approach
        coverage_success, coverage, unit_test_output, coverage_output, test_stats = (
            grade_test_output(test_suite, instance, test_output, test_spec, runtime)
        )

        # Update report with test statistics
        instance['test_result']['report'].update(
            {
                'test_output': unit_test_output,
                # 'coverage_output': coverage_output,
                'tests_pass': test_stats['any_pass'],  # Changed to use any_pass
                'all_tests_pass': test_stats['all_pass'],  # Added all_pass metric
                'coverage_success': coverage_success,
                'coverage': coverage if coverage_success else 0,
                'test_stats': test_stats,
            }
        )

        # Only run mutation testing if we have passing tests and coverage
        if (
            not args.skip_mutation
            and coverage_success
            and test_stats['any_pass']
            and coverage > 0
        ):
            mutation_timeout = max(10, 1.5 * test_time)
            mutation_toml = MUTATION_TEMPLATE.format(
                test_cmd=test_spec.test_cmd,
                source_fp=test_spec.code_file,
                timeout=mutation_timeout,
            )

            with tempfile.TemporaryDirectory() as temp_dir:
                mutation_toml_path = os.path.join(temp_dir, 'mutation.toml')
                with open(mutation_toml_path, 'w') as f:
                    f.write(mutation_toml)
                runtime.copy_to(mutation_toml_path, '/tmp')

            run_command(runtime, 'cp /tmp/mutation.toml /testbed/mutation.toml')

            mutation_code, mutation_output = run_mutation_testing(
                runtime, instance, '/tmp/mutation.sh'
            )
            # instance['test_result']['report']['mutation_output'] = mutation_output
            if mutation_output and mutation_code == 0:
                (
                    mutation_success,
                    num_mutants,
                    mutation_score,
                    mutation_confidence_interval,
                ) = check_mutation(mutation_output)
                instance['test_result']['report']['num_mutants'] = num_mutants
                instance['test_result']['report']['mutation_success'] = mutation_success
                instance['test_result']['report']['mutation_score'] = mutation_score
                instance['test_result']['report']['mutation_error_interval'] = (
                    mutation_confidence_interval
                )

        return EvalOutput(
            instance_id=instance.instance_id, test_result=instance['test_result']
        )
    except Exception as e:
        logger.error(f'Error processing instance {instance.instance_id}: {e}')
        raise RuntimeError(
            instance.instance_id,
            'Unexpected output...',
            logger,
        )
    finally:
        runtime.close()


def count_and_log_fields(evaluated_predictions, fields, key):
    """
    Count and log the sum of specified fields in the evaluated predictions,
    ignoring fields with a value of -1. If all values for a field are -1,
    return -1.

    :param evaluated_predictions: DataFrame containing evaluation results
    :param fields: List of field names to count
    :param key: Key to access the field values ('report' or 'lexical')
    """

    def count_field(row, field):
        value = row['test_result'][key][field]
        return (
            value if value != -1 else None
        )  # Ignore -1 fields by treating them as None

    for field in fields:
        # Extract the valid values for the field, ignoring -1
        valid_values = evaluated_predictions.apply(
            count_field, args=(field,), axis=1
        ).dropna()

        if valid_values.empty:  # If all values are -1
            logger.info(f'# {field}: -1 (All values are -1)')
        else:
            count = valid_values.sum()  # Sum of valid values
            length = len(valid_values)  # Count of valid entries
            logger.info(f'# {field}: {length}. ({count / length:.2f})')


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--input-file', type=str, required=True, help='Path to input predictions file'
    )
    parser.add_argument(
        '--dataset',
        type=str,
        default='kjain14/testgeneval',
        help='Dataset to evaluate on',
    )
    parser.add_argument(
        '--split', type=str, default='test', help='Split to evaluate on'
    )
    parser.add_argument(
        '--skip_mutation', action='store_true', help='Skip mutation testing'
    )
    parser.add_argument(
        '--skip_lexical', action='store_true', help='Skip lexical metrics'
    )
    parser.add_argument(
        '--mutation_timeout',
        type=int,
        default=MUTATION_TIMEOUT,
        help='Mutation timeout',
    )
    parser.add_argument(
        '--mutation_buffer',
        type=int,
        default=MUTATION_BUFFER,
        help='Mutation buffer',
    )
    args, _ = parser.parse_known_args()

    dataset: list[TestGenEvalInstance] = load_testgeneval_dataset(
        args.dataset, args.split
    )

    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
    )

    # Load predictions
    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
    predictions = pd.read_json(args.input_file, lines=True)
    assert (
        'instance_id' in predictions.columns
    ), 'Input file must contain instance_id column.'

    if 'test_suite' not in predictions.columns and not (
        'test_result' in predictions.columns
        and 'test_suite' in predictions['test_result'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain test_suite column OR test_result column with test_suite field.'
        )

    if 'instance_id_swebench' not in predictions.columns:
        predictions['instance_id_swebench'] = predictions['instance'].apply(
            lambda x: x['instance_id_swebench']
        )

    if 'instance_id' not in predictions.columns and not (
        'instance_id' in predictions['instance'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain id column OR instance column with id field.'
        )

    if 'instance_id' not in predictions.columns:
        predictions['instance_id'] = predictions['instance'].apply(
            lambda x: x['instance_id']
        )

    if 'test_suite' not in predictions.columns:
        predictions['test_suite'] = predictions['test_result'].apply(
            lambda x: x['test_suite']
        )

    assert len(predictions['instance_id'].unique()) == len(
        predictions
    ), 'instance_id column must be unique.'

    assert {'instance_id_swebench', 'test_suite', 'instance_id'}.issubset(
        set(predictions.columns)
    ), 'Input file must contain id, instance_id and test_suite columns.'

    predictions['test_spec'] = predictions['instance'].apply(
        lambda x: make_test_spec(x, args.mutation_timeout, args.mutation_buffer)
    )

    output_file = args.input_file.replace('.jsonl', '.testgeneval.jsonl')
    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)

    # If possible, load the relevant metadata to avoid issues with `run_evaluation`.
    metadata: EvalMetadata | None = None
    metadata_filepath = os.path.join(os.path.dirname(args.input_file), 'metadata.json')
    if os.path.exists(metadata_filepath):
        with open(metadata_filepath, 'r') as metadata_file:
            data = metadata_file.read()
            metadata = EvalMetadata.model_validate_json(data)

    # The evaluation harness constrains the signature of `process_instance_func` but we need to
    # pass extra information. Build a new function object to avoid issues with multiprocessing.
    process_instance_func = partial(
        process_instance, log_dir=output_file.replace('.jsonl', '.logs')
    )

    run_evaluation(
        instances,
        metadata=None,
        output_file=output_file,
        num_workers=args.eval_num_workers,
        process_instance_func=process_instance_func,
    )

    # Load evaluated predictions & print number of resolved predictions
    evaluated_predictions = pd.read_json(output_file, lines=True)
    report_fields = [
        'coverage',
        'mutation_score',
        'tests_pass',
        'all_tests_pass',
        'empty_generation',
        'coverage_success',
        'test_timeout',
        'error_eval',
    ]
    lexical_fields = [
        'pred_loc',
        'gold_loc',
        'pred_methods',
        'gold_methods',
        'bleu',
        'xmatch',
        'edit_sim',
        'rouge_f',
        'rouge_p',
        'rouge_r',
    ]

    # Log report and lexical fields
    count_and_log_fields(evaluated_predictions, report_fields, key='report')
    count_and_log_fields(evaluated_predictions, lexical_fields, key='lexical')
evaluation/benchmarks/testgeneval/log_parsers.py (new file, 291 lines)
@@ -0,0 +1,291 @@
import re

from evaluation.benchmarks.testgeneval.constants import TestStatus


def parse_log_pytest(log: str) -> dict[str, str]:
    """
    Parser for test logs generated with PyTest framework

    Args:
        log (str): log content
    Returns:
        dict: test case to test status mapping
    """
    test_status_map = {}
    for line in log.split('\n'):
        if any([line.startswith(x.value) for x in TestStatus]):
            # Additional parsing for FAILED status
            if line.startswith(TestStatus.FAILED.value):
                line = line.replace(' - ', ' ')
            test_case = line.split()
            if len(test_case) <= 1:
                continue
            test_status_map[test_case[1]] = test_case[0]
    return test_status_map


def parse_log_pytest_options(log: str) -> dict[str, str]:
    """
    Parser for test logs generated with PyTest framework with options

    Args:
        log (str): log content
    Returns:
        dict: test case to test status mapping
    """
    option_pattern = re.compile(r'(.*?)\[(.*)\]')
    test_status_map = {}
    for line in log.split('\n'):
        if any([line.startswith(x.value) for x in TestStatus]):
            # Additional parsing for FAILED status
            if line.startswith(TestStatus.FAILED.value):
                line = line.replace(' - ', ' ')
            test_case = line.split()
            if len(test_case) <= 1:
                continue
            has_option = option_pattern.search(test_case[1])
            if has_option:
                main, option = has_option.groups()
                if (
                    option.startswith('/')
                    and not option.startswith('//')
                    and '*' not in option
                ):
                    option = '/' + option.split('/')[-1]
                test_name = f'{main}[{option}]'
            else:
                test_name = test_case[1]
            test_status_map[test_name] = test_case[0]
    return test_status_map


def parse_log_django(log: str) -> dict[str, str]:
    """
    Parser for test logs generated with Django tester framework

    Args:
        log (str): log content
    Returns:
        dict: test case to test status mapping
    """
    test_status_map = {}
    lines = log.split('\n')

    prev_test = None
    for line in lines:
        line = line.strip()

        # This isn't ideal but the test output spans multiple lines
        if '--version is equivalent to version' in line:
            test_status_map['--version is equivalent to version'] = (
                TestStatus.PASSED.value
            )

        # Log it in case of error
        if ' ... ' in line:
            prev_test = line.split(' ... ')[0]

        pass_suffixes = (' ... ok', ' ... OK', ' ...  OK')
        for suffix in pass_suffixes:
            if line.endswith(suffix):
                # TODO: Temporary, exclusive fix for django__django-7188
                # The proper fix should involve somehow getting the test results to
                # print on a separate line, rather than the same line
                if line.strip().startswith(
                    'Applying sites.0002_alter_domain_unique...test_no_migrations'
                ):
                    line = line.split('...', 1)[-1].strip()
                test = line.rsplit(suffix, 1)[0]
                test_status_map[test] = TestStatus.PASSED.value
                break
        if ' ... skipped' in line:
            test = line.split(' ... skipped')[0]
            test_status_map[test] = TestStatus.SKIPPED.value
        if line.endswith(' ... FAIL'):
            test = line.split(' ... FAIL')[0]
            test_status_map[test] = TestStatus.FAILED.value
        if line.startswith('FAIL:'):
            test = line.split()[1].strip()
            test_status_map[test] = TestStatus.FAILED.value
        if line.endswith(' ... ERROR'):
            test = line.split(' ... ERROR')[0]
            test_status_map[test] = TestStatus.ERROR.value
        if line.startswith('ERROR:'):
            test = line.split()[1].strip()
            test_status_map[test] = TestStatus.ERROR.value

        if line.lstrip().startswith('ok') and prev_test is not None:
            # It means the test passed, but there's some additional output (including new lines)
            # between "..." and "ok" message
            test = prev_test
            test_status_map[test] = TestStatus.PASSED.value

    # TODO: This is very brittle, we should do better
    # There's a bug in the django logger, such that sometimes a test output near the end gets
    # interrupted by a particular long multiline print statement.
    # We have observed this in one of 3 forms:
    # - "{test_name} ... Testing against Django installed in {*} silenced.\nok"
    # - "{test_name} ... Internal Server Error: \/(.*)\/\nok"
    # - "{test_name} ... System check identified no issues (0 silenced).\nok"
    patterns = [
        r'^(.*?)\s\.\.\.\sTesting\ against\ Django\ installed\ in\ ((?s:.*?))\ silenced\)\.\nok$',
        r'^(.*?)\s\.\.\.\sInternal\ Server\ Error:\ \/(.*)\/\nok$',
        r'^(.*?)\s\.\.\.\sSystem check identified no issues \(0 silenced\)\nok$',
    ]
    for pattern in patterns:
        for match in re.finditer(pattern, log, re.MULTILINE):
            test_name = match.group(1)
            test_status_map[test_name] = TestStatus.PASSED.value
    return test_status_map


def parse_log_pytest_v2(log: str) -> dict[str, str]:
    """
    Parser for test logs generated with PyTest framework (Later Version)

    Args:
        log (str): log content
    Returns:
        dict: test case to test status mapping
    """
    test_status_map = {}
    escapes = ''.join([chr(char) for char in range(1, 32)])
    for line in log.split('\n'):
        line = re.sub(r'\[(\d+)m', '', line)
        translator = str.maketrans('', '', escapes)
        line = line.translate(translator)
        if any([line.startswith(x.value) for x in TestStatus]):
            if line.startswith(TestStatus.FAILED.value):
                line = line.replace(' - ', ' ')
            test_case = line.split()
            if len(test_case) >= 2:
                test_status_map[test_case[1]] = test_case[0]
        # Support older pytest versions by checking if the line ends with the test status
        elif any([line.endswith(x.value) for x in TestStatus]):
            test_case = line.split()
            if len(test_case) >= 2:
                test_status_map[test_case[0]] = test_case[1]
    return test_status_map


def parse_log_seaborn(log: str) -> dict[str, str]:
    """
    Parser for test logs generated with seaborn testing framework

    Args:
        log (str): log content
    Returns:
        dict: test case to test status mapping
    """
    test_status_map = {}
    for line in log.split('\n'):
        if line.startswith(TestStatus.FAILED.value):
            test_case = line.split()[1]
            test_status_map[test_case] = TestStatus.FAILED.value
        elif f' {TestStatus.PASSED.value} ' in line:
            parts = line.split()
            if parts[1] == TestStatus.PASSED.value:
                test_case = parts[0]
                test_status_map[test_case] = TestStatus.PASSED.value
        elif line.startswith(TestStatus.PASSED.value):
            parts = line.split()
            test_case = parts[1]
            test_status_map[test_case] = TestStatus.PASSED.value
    return test_status_map


def parse_log_sympy(log: str) -> dict[str, str]:
    """
    Parser for test logs generated with Sympy framework

    Args:
        log (str): log content
    Returns:
        dict: test case to test status mapping
    """
    test_status_map = {}
    pattern = r'(_*) (.*)\.py:(.*) (_*)'
    matches = re.findall(pattern, log)
    for match in matches:
        test_case = f'{match[1]}.py:{match[2]}'
        test_status_map[test_case] = TestStatus.FAILED.value
    for line in log.split('\n'):
        line = line.strip()
        if line.startswith('test_'):
            if line.endswith('[FAIL]') or line.endswith('[OK]'):
                line = line[: line.rfind('[')]
                line = line.strip()
            if line.endswith(' E'):
                test = line.split()[0]
                test_status_map[test] = TestStatus.ERROR.value
            if line.endswith(' F'):
                test = line.split()[0]
                test_status_map[test] = TestStatus.FAILED.value
            if line.endswith(' ok'):
                test = line.split()[0]
                test_status_map[test] = TestStatus.PASSED.value
    return test_status_map


def parse_log_matplotlib(log: str) -> dict[str, str]:
    """
    Parser for test logs generated with PyTest framework

    Args:
        log (str): log content
    Returns:
        dict: test case to test status mapping
    """
    test_status_map = {}
    for line in log.split('\n'):
        line = line.replace('MouseButton.LEFT', '1')
        line = line.replace('MouseButton.RIGHT', '3')
        if any([line.startswith(x.value) for x in TestStatus]):
            # Additional parsing for FAILED status
            if line.startswith(TestStatus.FAILED.value):
                line = line.replace(' - ', ' ')
            test_case = line.split()
            if len(test_case) <= 1:
                continue
            test_status_map[test_case[1]] = test_case[0]
    return test_status_map


parse_log_astroid = parse_log_pytest
parse_log_flask = parse_log_pytest
parse_log_marshmallow = parse_log_pytest
parse_log_pvlib = parse_log_pytest
parse_log_pyvista = parse_log_pytest
parse_log_sqlfluff = parse_log_pytest
parse_log_xarray = parse_log_pytest

parse_log_pydicom = parse_log_pytest_options
parse_log_requests = parse_log_pytest_options
parse_log_pylint = parse_log_pytest_options

parse_log_astropy = parse_log_pytest_v2
parse_log_scikit = parse_log_pytest_v2
parse_log_sphinx = parse_log_pytest_v2


MAP_REPO_TO_PARSER = {
    'astropy/astropy': parse_log_astropy,
    'django/django': parse_log_django,
    'marshmallow-code/marshmallow': parse_log_marshmallow,
    'matplotlib/matplotlib': parse_log_matplotlib,
    'mwaskom/seaborn': parse_log_seaborn,
    'pallets/flask': parse_log_flask,
    'psf/requests': parse_log_requests,
    'pvlib/pvlib-python': parse_log_pvlib,
    'pydata/xarray': parse_log_xarray,
    'pydicom/pydicom': parse_log_pydicom,
    'pylint-dev/astroid': parse_log_astroid,
    'pylint-dev/pylint': parse_log_pylint,
    'pytest-dev/pytest': parse_log_pytest,
    'pyvista/pyvista': parse_log_pyvista,
    'scikit-learn/scikit-learn': parse_log_scikit,
    'sqlfluff/sqlfluff': parse_log_sqlfluff,
    'sphinx-doc/sphinx': parse_log_sphinx,
    'sympy/sympy': parse_log_sympy,
}
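For reference, a minimal usage sketch of the parser map (the log text is made up, and the expected output assumes the `TestStatus` values used by SWE-bench-style logs):

```python
# Illustrative only: pick a parser by repository and parse a pytest-style log.
from evaluation.benchmarks.testgeneval.log_parsers import MAP_REPO_TO_PARSER

log = 'PASSED test_indexing.py::test_getitem\nFAILED test_indexing.py::test_setitem - KeyError'
parser = MAP_REPO_TO_PARSER['pydata/xarray']  # resolves to parse_log_pytest
status_by_test = parser(log)
print(status_by_test)  # e.g. {'test_indexing.py::test_getitem': 'PASSED', ...}
```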
evaluation/benchmarks/testgeneval/metrics.py (new file, 311 lines)
@@ -0,0 +1,311 @@
import sys
from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union

import nltk
import numpy as np
from fuzzywuzzy import fuzz
from rouge import Rouge

# increase recursion depth to ensure ROUGE can be calculated for long sentences
if sys.getrecursionlimit() < 10_000:
    sys.setrecursionlimit(10_000)


def bleu(gold: List[str], pred: List[str]) -> float:
    """
    Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.

    :param gold: list of gold tokens
    :param pred: list of predicted tokens
    :return: BLEU score
    """
    if len(pred) == 0 or len(gold) == 0:
        return 0.0
    return 100.0 * nltk.translate.bleu_score.sentence_bleu(
        [gold],
        pred,
        smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method2,
        auto_reweigh=True,
    )


def batch_bleu(golds: List[List[str]], preds: List[List[str]]) -> List[float]:
    """
    Calculate BLEU score for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :return: list of BLEU scores
    """
    if len(golds) != len(preds):
        raise ValueError("golds and preds must have the same length")
    return [bleu(gold, pred) for gold, pred in zip(golds, preds)]


def corpus_bleu(golds: List[List[str]], preds: List[List[str]]) -> float:
    """
    Calculate corpus-level BLEU score for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :return: corpus-level BLEU score
    """
    if len(golds) != len(preds):
        raise ValueError("golds and preds must have the same length")
    return 100.0 * nltk.translate.bleu_score.corpus_bleu(
        [[gold] for gold in golds],
        preds,
        smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method2,
        auto_reweigh=True,
    )


def edit_sim(
    gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = " "
) -> float:
    """
    Calculate char-level edit similarity, in the range of 0~100.

    :param gold: gold sentence or list of gold tokens
    :param pred: predicted sentence or list of predicted tokens
    :param sep: separator between tokens
    :return: char-level edit similarity
    """
    if len(pred) == 0 or len(gold) == 0:
        return 0.0
    if isinstance(gold, list):
        gold = sep.join(gold)
    if isinstance(pred, list):
        pred = sep.join(pred)
    return fuzz.ratio(gold, pred)


def batch_edit_sim(
    golds: List[Union[str, List[str]]],
    preds: List[Union[str, List[str]]],
    sep: str = " ",
) -> List[float]:
    """
    Calculate char-level edit similarity for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :param sep: separator between tokens
    :return: list of char-level edit similarity
    """
    if len(golds) != len(preds):
        raise ValueError("golds and preds must have the same length")
    return [edit_sim(gold, pred, sep) for gold, pred in zip(golds, preds)]


T = TypeVar("T")


def exact_match(gold: T, pred: T) -> float:
    """
    Calculate exact match accuracy, in the range of {0, 100}.

    :param gold: gold sentence or list of gold tokens
    :param pred: predicted sentence or list of predicted tokens
    :return: exact match accuracy
    """
    if len(pred) == 0 or len(gold) == 0:
        return 0.0
    return 100.0 if gold == pred else 0.0


def batch_exact_match(golds: List[T], preds: List[T]) -> List[float]:
    """
    Calculate exact match accuracy for a batch of sentences.

    :param golds: list of gold sentences
    :param preds: list of predicted sentences
    :return: list of exact match accuracy
    """
    if len(golds) != len(preds):
        raise ValueError("golds and preds must have the same length")
    return [exact_match(gold, pred) for gold, pred in zip(golds, preds)]


def rouge_l(
    gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = " "
) -> Dict[str, float]:
    """
    Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.

    :param gold: gold sentence or list of gold tokens
    :param pred: predicted sentence or list of predicted tokens
    :return: {"p": precision, "r": recall, "f": F1}
    """
    if len(pred) == 0 or len(gold) == 0:
        return {"p": 0.0, "r": 0.0, "f": 0.0}
    if isinstance(gold, list):
        gold = sep.join(gold)
    if isinstance(pred, list):
        pred = sep.join(pred)
    try:
        rouge = Rouge()
        scores = rouge.get_scores(hyps=pred, refs=gold, avg=True)
        return {x: scores["rouge-l"][x] * 100.0 for x in ["p", "r", "f"]}
    except ValueError:
|
||||
return {"p": 0.0, "r": 0.0, "f": 0.0}
|
||||
|
||||
|
||||
def batch_rouge_l(
|
||||
golds: List[Union[str, List[str]]],
|
||||
preds: List[Union[str, List[str]]],
|
||||
sep: str = " ",
|
||||
) -> Dict[str, List[float]]:
|
||||
"""
|
||||
Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
:param sep: separator between tokens
|
||||
:return: list of {"p": precision, "r": recall, "f": F1}
|
||||
"""
|
||||
if len(golds) != len(preds):
|
||||
raise ValueError("golds and preds must have the same length")
|
||||
scores = [rouge_l(gold, pred, sep) for gold, pred in zip(golds, preds)]
|
||||
return {x: [score[x] for score in scores] for x in ["p", "r", "f"]}
|
||||
|
||||
|
||||
def accuracy(
|
||||
gold: List[str],
|
||||
pred: List[str],
|
||||
ignore: Optional[Sequence[str]] = None,
|
||||
) -> float:
|
||||
"""
|
||||
Calculate token-level accuracy, in the range of 0~100.
|
||||
If gold and pred are not the same length, the longer one is truncated.
|
||||
|
||||
:param gold: list of gold tokens
|
||||
:param pred: list of predicted tokens
|
||||
:param ignore: list of (gold) tokens to ignore
|
||||
:return: accuracy
|
||||
"""
|
||||
if len(pred) == 0 or len(gold) == 0:
|
||||
return 0.0
|
||||
if ignore is None:
|
||||
ignore = []
|
||||
i = 0
|
||||
total = 0
|
||||
match = 0
|
||||
while i < len(gold) and i < len(pred):
|
||||
if gold[i] in ignore:
|
||||
i += 1
|
||||
continue
|
||||
total += 1
|
||||
if gold[i] == pred[i]:
|
||||
match += 1
|
||||
i += 1
|
||||
|
||||
if total == 0:
|
||||
return 0.0
|
||||
return 100.0 * match / total
|
||||
|
||||
|
||||
def batch_accuracy(
|
||||
golds: List[List[str]],
|
||||
preds: List[List[str]],
|
||||
ignore: Optional[Sequence[str]] = None,
|
||||
) -> List[float]:
|
||||
"""
|
||||
Calculate token-level accuracy for a batch of sentences.
|
||||
|
||||
:param golds: list of gold sentences
|
||||
:param preds: list of predicted sentences
|
||||
:param ignore: list of (gold) tokens to ignore
|
||||
:return: list of accuracy
|
||||
"""
|
||||
if len(golds) != len(preds):
|
||||
raise ValueError("golds and preds must have the same length")
|
||||
return [accuracy(gold, pred, ignore) for gold, pred in zip(golds, preds)]
|
||||
|
||||
|
||||
def first_match_to_topk(
|
||||
first_match_list: List[int], k_values: List[int]
|
||||
) -> Dict[int, List[float]]:
|
||||
"""
|
||||
Calculate top-k accuracy with the first match ranks (1-indexed).
|
||||
|
||||
:param first_match: first match ranks (1-indexed)
|
||||
:param k_values: k values to consider
|
||||
:return: a mapping from k to top-k accuracies (ranging from 0~100)
|
||||
"""
|
||||
return {k: [100.0 if x <= k else 0.0 for x in first_match_list] for k in k_values}
|
||||
|
||||
|
||||
def pass_at_k(n: int, c: int, k: int) -> float:
|
||||
"""
|
||||
Sample pass@k metric according to the Codex paper, but in the scale of 0~100.
|
||||
:param n: total number of samples
|
||||
:param c: number of correct samples
|
||||
:param k: k in pass@$k$
|
||||
"""
|
||||
if n < k or (n - c) < k:
|
||||
# fallback to the (1 - (1-p)^k) formula
|
||||
return (1 - (1 - (c / n)) ** k) * 100
|
||||
else:
|
||||
return (1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)).item()) * 100
|
||||
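For reference, the product computed in the else-branch is the standard unbiased pass@k estimator from the Codex paper (scaled here to 0~100):

$$\text{pass@}k \;=\; 1 - \frac{\binom{n-c}{k}}{\binom{n}{k}} \;=\; 1 - \prod_{i=n-c+1}^{n}\left(1 - \frac{k}{i}\right)$$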
|
||||
|
||||
def self_bleu(samples: List[List[str]]) -> float:
|
||||
"""
|
||||
Calculate self-BLEU among the samples.
|
||||
:param samples: the chosen m samples
|
||||
:return: self-BLEU
|
||||
"""
|
||||
if len(samples) == 0:
|
||||
return 100.0
|
||||
|
||||
scores = []
|
||||
for i in range(len(samples)):
|
||||
scores.append(
|
||||
100.0
|
||||
* nltk.translate.bleu_score.sentence_bleu(
|
||||
[samples[j] for j in range(len(samples)) if j != i],
|
||||
samples[i],
|
||||
smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method2,
|
||||
auto_reweigh=True,
|
||||
)
|
||||
)
|
||||
return np.mean(scores).item()
|
||||
|
||||
|
||||
def self_edit_distance(samples: List[Union[str, List[str]]], sep=" ") -> float:
|
||||
"""
|
||||
Calculate self-edit-distance among the samples.
|
||||
:param samples: the chosen m samples
|
||||
:param sep: the separator between tokens
|
||||
:return: self-edit-distance
|
||||
"""
|
||||
if len(samples) == 0:
|
||||
return 0.0
|
||||
|
||||
scores = []
|
||||
for i in range(len(samples)):
|
||||
sample_i = samples[i]
|
||||
if not isinstance(sample_i, str):
|
||||
sample_i = sep.join(sample_i)
|
||||
for j in range(len(samples)):
|
||||
if i == j:
|
||||
continue
|
||||
sample_j = samples[j]
|
||||
if not isinstance(sample_j, str):
|
||||
sample_j = sep.join(sample_j)
|
||||
|
||||
scores.append(100 - fuzz.ratio(sample_i, sample_j))
|
||||
return np.mean(scores).item()
|
||||
|
||||
|
||||
|
||||
QUALITY_METRICS: Dict[str, Callable[[List[str], List[str]], float]] = {
|
||||
"bleu": bleu,
|
||||
"xmatch": exact_match,
|
||||
"edit-sim": edit_sim,
|
||||
"rouge-f": lambda g, p: rouge_l(g, p)["f"],
|
||||
"rouge-p": lambda g, p: rouge_l(g, p)["p"],
|
||||
"rouge-r": lambda g, p: rouge_l(g, p)["r"],
|
||||
}
|
||||
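A minimal usage sketch of the metrics above on a tokenized gold/predicted pair; the token lists below are made up for illustration:

```python
# Hypothetical inputs: tokenized gold and predicted test code.
gold_tokens = ['assert', 'add', '(', '1', ',', '2', ')', '==', '3']
pred_tokens = ['assert', 'add', '(', '2', ',', '2', ')', '==', '4']

for name, metric in QUALITY_METRICS.items():
    print(name, round(metric(gold_tokens, pred_tokens), 2))  # every score is on a 0~100 scale
```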
evaluation/benchmarks/testgeneval/prompt.py (new file, 114 lines)
@@ -0,0 +1,114 @@
|
||||
CODEACT_TESTGEN_PROMPT_OLD = """Your goal is to generate a high-quality test suite (at least 20 passing tests) for the code file: {code_file}. Output the test suite at {test_file}.\n
|
||||
|
||||
[current directory: /workspace/{workspace_dir_name}]
|
||||
|
||||
IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP
|
||||
|
||||
IMPORTANT: Follow the instructions; if you have < 80 tests, generate more tests rather than trying to fix the ones you have.
|
||||
|
||||
IMPORTANT: Code file to test:
|
||||
```python
|
||||
{code_src}
|
||||
```
|
||||
|
||||
Here are additional imports that you may need:
|
||||
{imports}
|
||||
|
||||
Look at code dependencies (NOT {code_file}, since you already have its contents) and at test files you need context from in order to write a complete test suite.
|
||||
|
||||
Aim for 20+ test functions with asserts. Do not hesitate to use the Python interpreter to understand the input/output behavior of the code you are testing.
|
||||
|
||||
Output your test suite at {test_file}. Each unit test must be a function starting with test_. Include all your test imports and setup before your first test. Do not include a main method to run the tests. Make the suite as comprehensive as possible and try to exercise all the methods you saw.
|
||||
|
||||
When you think you've successfully generated a test suite, run it on the current project using {coverage_command}.
|
||||
|
||||
If you have few tests GENERATE MORE TESTS rather than trying to fix the ones you have (it is possible to filter out failing tests later).
|
||||
|
||||
Then run coverage report -m --include {code_file} to see how well your test suite covers the code under test.
|
||||
|
||||
When you are trying to improve coverage, pick a part of the code that is not covered (indicated by the missing lines in the coverage report), examine that code, and then
try to generate a test for it. Feel free to use a code interpreter to understand the input/output behavior. ONLY add tests,
do not remove them.
|
||||
|
||||
If you are unable to see passing and failing tests, FIX YOUR IMPORTS to use the same style as other test files.
|
||||
|
||||
You should NOT modify any existing test case files. You SHOULD add your new tests in a NEW file.
|
||||
|
||||
You should NEVER use web browsing or any other web-based tools.
|
||||
|
||||
You should NEVER install new packages, use existing packages only.
|
||||
|
||||
You should ALWAYS use the default Python interpreter available in the <execute_bash> environment to run code related to the provided issue and/or repository.
|
||||
|
||||
You should ALWAYS use local imports; DO NOT import the general library.
|
||||
|
||||
When you think you have a fully adequate test suite, please run the following command: <execute_bash> exit </execute_bash>.
|
||||
"""
|
||||
|
||||
CODEACT_TESTGEN_PROMPT = """
|
||||
Your goal is to generate a comprehensive, **broad-coverage** test suite for the code below, ensuring you test as many lines and branches as possible on the first attempt.
|
||||
|
||||
Place your test suite in a new file named {test_file}.
|
||||
|
||||
IMPORTANT REQUIREMENTS:
|
||||
1. **No external help or resources**—use only the snippet below.
|
||||
2. **Focus on breadth over depth**: cover all major functions, classes, and code paths early to minimize coverage iterations.
|
||||
3. Each test function must start with `test_` and use `assert` to verify behavior.
|
||||
4. Include only necessary imports (standard library or local).
|
||||
5. Do **not** modify existing test files—create a brand new one. No `main()` or other non-test code.
|
||||
6. Produce **at least 20 test functions**; if coverage is lacking, add more tests rather than removing or changing existing ones.
|
||||
7. Use the following commands to check coverage:
|
||||
<execute_bash> {coverage_command} </execute_bash>
|
||||
<execute_bash> coverage report -m --include {code_file} </execute_bash>
|
||||
If lines remain uncovered, add new tests targeting them specifically.
|
||||
8. When you're satisfied with coverage, finalize by running:
|
||||
<execute_bash> exit </execute_bash>
|
||||
|
||||
Below is the **complete code snippet** to test:
|
||||
|
||||
<START_OF_CODE>
|
||||
{code_src}
|
||||
<END_OF_CODE>
|
||||
|
||||
NOTE: if you are testing django, you must use from django.test import SimpleTestCase and class based tests (i.e. class TestSomething(SimpleTestCase)).
|
||||
NOTE: if there is an error executing tests you MUST fix it before exiting. DO NOT install new packages.
|
||||
NOTE: if outputting a revised test suite REPLACE {test_file} with the revised suite
|
||||
|
||||
**Output the final test suite** (20+ tests) for {test_file} in a single code block, no extra commentary. MAKE SURE you run the tests and ensure you can see which tests passed and failed BEFORE exiting.
|
||||
"""
|
||||
|
||||
CODEACT_TESTGEN_PROMPT_ITERATE = """
|
||||
Your goal is to improve the test suite at {test_file} to achieve **broad-coverage** of the code below.
|
||||
|
||||
First run the test suite.
|
||||
|
||||
If no tests run, then remove {test_file} and create {test_file} with a new suite.
|
||||
|
||||
Otherwise, improve it aiming to improve code coverage.
|
||||
|
||||
IMPORTANT REQUIREMENTS:
|
||||
1. Use the following commands to check coverage (RUN THIS FIRST):
|
||||
<execute_bash> {coverage_command} </execute_bash>
|
||||
<execute_bash> coverage report -m --include {code_file} </execute_bash>
|
||||
If lines remain uncovered, add new tests targeting them specifically.
|
||||
2. **No external help or resources**—use only the snippet below.
|
||||
3. **Focus on breadth over depth**: cover all major functions, classes, and code paths early to minimize coverage iterations.
|
||||
4. Each test function must use `assert` to verify behavior.
|
||||
5. Include only necessary imports (standard library or local).
|
||||
6. Do **not** modify other test files in the repository. No `main()` or other non-test code.
|
||||
7. Produce **at least 20 test functions**; if coverage is lacking, add more tests rather than removing or changing existing ones.
|
||||
8. When you're satisfied with coverage, finalize by running:
|
||||
<execute_bash> exit </execute_bash>
|
||||
|
||||
Below is the **complete code snippet** to test:
|
||||
|
||||
<START_OF_CODE>
|
||||
{code_src}
|
||||
<END_OF_CODE>
|
||||
|
||||
NOTE: if you are testing django, you must use from django.test import SimpleTestCase and class based tests (i.e. class TestSomething(SimpleTestCase)).
|
||||
NOTE: if there is an error executing tests you MUST fix it before exiting. DO NOT install new packages.
|
||||
NOTE: if outputting a revised test suite REPLACE {test_file} with the revised suite
|
||||
|
||||
**Output the final test suite** (20+ tests) for {test_file} in a single code block, no extra commentary. MAKE SURE you run the tests and ensure you can see which tests passed and failed BEFORE exiting.
|
||||
"""
|
||||
evaluation/benchmarks/testgeneval/pygments_utils.py (new file, 31 lines)
@@ -0,0 +1,31 @@
|
||||
import re
|
||||
from pygments.lexers.python import PythonLexer
|
||||
|
||||
def tokenize_code(code):
|
||||
lexer = PythonLexer()
|
||||
tokens = process_pygments_tokens(lexer.get_tokens(code))
|
||||
return tokens
|
||||
|
||||
def process_pygments_tokens(tokens):
|
||||
new_tokens = []
|
||||
|
||||
for token in tokens:
|
||||
if (str(token[0]) == "Token.Text" and re.match(r'\s+', token[1])) or str(token[0]) == "Token.Text.Whitespace":
|
||||
continue
|
||||
new_tokens.append(token[1])
|
||||
|
||||
new_tokens_final = []
|
||||
i = 0
|
||||
while i < len(new_tokens)-2:
|
||||
if new_tokens[i] == '"' and new_tokens[i+1]=='STR' and new_tokens[i+2] == '"':
|
||||
new_tokens_final.append("\"STR\"")
|
||||
i = i + 3
|
||||
else:
|
||||
new_tokens_final.append(new_tokens[i])
|
||||
i = i + 1
|
||||
|
||||
for i in range(len(new_tokens)-2, len(new_tokens)):
|
||||
if i >= 0:
|
||||
new_tokens_final.append(new_tokens[i])
|
||||
|
||||
return new_tokens_final
|
||||
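For illustration, an assumed example of what `tokenize_code` produces; the exact token split depends on the installed Pygments version:

```python
# Whitespace tokens are dropped by process_pygments_tokens, so only meaningful lexemes remain.
print(tokenize_code('def add(a, b):\n    return a + b\n'))
# expected (approximately): ['def', 'add', '(', 'a', ',', 'b', ')', ':', 'return', 'a', '+', 'b']
```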
evaluation/benchmarks/testgeneval/report_utils.py (new file, 58 lines)
@@ -0,0 +1,58 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def check_coverage(coverage_output, code_file):
|
||||
json_cov = json.loads(coverage_output)
|
||||
if code_file in json_cov['files'].keys():
|
||||
file_data = json_cov['files'][code_file]
|
||||
return True, file_data['summary']['percent_covered']
|
||||
|
||||
return False, 0
|
||||
|
||||
|
||||
def check_mutation(mutation_output):
|
||||
if 'total jobs: ' in mutation_output:
|
||||
num_mutants = int(mutation_output.split('total jobs: ')[1].split('\n')[0])
|
||||
final_conf = mutation_output.split('\n')[-1]
|
||||
if len(final_conf.strip().split(' ')) == 3:
|
||||
low, val, high = final_conf.strip().split(' ')
|
||||
low = float(low)
|
||||
val = float(val)
|
||||
high = float(high)
|
||||
|
||||
confidence_range = high - val
|
||||
mutation_score = 100 - val
|
||||
|
||||
return True, num_mutants, mutation_score, confidence_range
|
||||
|
||||
return False, -1, 0, -1
|
||||
|
||||
|
||||
def count_methods(code_str):
|
||||
"""
|
||||
Counts the number of methods/functions in a given string of code.
|
||||
|
||||
Args:
|
||||
code_str (str): A string containing code.
|
||||
|
||||
Returns:
|
||||
int: The number of methods/functions found.
|
||||
"""
|
||||
# Regular expression to find Python function definitions
|
||||
pattern = r'\bdef\b\s+\w+\s*\('
|
||||
matches = re.findall(pattern, code_str)
|
||||
return len(matches)
|
||||
|
||||
|
||||
def get_lines_of_code(code_str):
|
||||
"""
|
||||
Counts the number of lines of code in a given string.
|
||||
|
||||
Args:
|
||||
code_str (str): A string containing code.
|
||||
|
||||
Returns:
|
||||
int: The number of lines of code.
|
||||
"""
|
||||
return len(code_str.strip().split('\n'))
|
||||
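A small sketch of the `coverage json` report shape that `check_coverage` expects; the file name and numbers below are invented for illustration, but the `files`/`summary`/`percent_covered` fields follow coverage.py's JSON report:

```python
# Hypothetical coverage.py JSON report, trimmed to the fields check_coverage reads.
sample_report = '{"files": {"xarray/core/merge.py": {"summary": {"percent_covered": 87.5}}}}'
covered, pct = check_coverage(sample_report, 'xarray/core/merge.py')
print(covered, pct)  # -> True 87.5
```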
evaluation/benchmarks/testgeneval/run_infer.py (new file, 578 lines)
@@ -0,0 +1,578 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import toml
|
||||
from datasets import load_dataset
|
||||
|
||||
import openhands.agenthub
|
||||
from evaluation.benchmarks.testgeneval.constants import MAP_REPO_VERSION_TO_SPECS
|
||||
from evaluation.benchmarks.testgeneval.prompt import (
|
||||
CODEACT_TESTGEN_PROMPT,
|
||||
CODEACT_TESTGEN_PROMPT_ITERATE,
|
||||
)
|
||||
from evaluation.benchmarks.testgeneval.utils import get_test_directives
|
||||
from evaluation.utils.shared import (
|
||||
EvalException,
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
assert_and_raise,
|
||||
codeact_user_response,
|
||||
get_metrics,
|
||||
is_fatal_evaluation_error,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
update_llm_config_for_completions_logging,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
AgentConfig,
|
||||
AppConfig,
|
||||
SandboxConfig,
|
||||
get_llm_config_arg,
|
||||
get_parser,
|
||||
)
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
from openhands.events.action import CmdRunAction, MessageAction
|
||||
from openhands.events.observation import CmdOutputObservation, ErrorObservation
|
||||
from openhands.events.serialization.event import event_to_dict
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
}
|
||||
|
||||
|
||||
def _preprocess_instance(d):
|
||||
for key, value in d.items():
|
||||
if isinstance(value, np.ndarray):
|
||||
d[key] = value.tolist()
|
||||
return d
|
||||
|
||||
|
||||
def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
|
||||
return f'{instance.repo}__{instance.version}'.replace('/', '__')
|
||||
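For example (the repo and version values below are illustrative):

```python
# Illustrative values; the real function reads these fields from the instance Series.
repo, version = 'pydata/xarray', '0.12'
print(f'{repo}__{version}'.replace('/', '__'))  # -> pydata__xarray__0.12
```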
|
||||
|
||||
def get_instruction(instance: pd.Series, metadata: EvalMetadata):
|
||||
# workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
||||
# Prepare instruction
|
||||
coverage_command = ' '.join(
|
||||
[
|
||||
MAP_REPO_VERSION_TO_SPECS[instance['repo']][instance['version']][
|
||||
'test_cmd'
|
||||
],
|
||||
*get_test_directives(instance),
|
||||
]
|
||||
)
|
||||
|
||||
# Testing general agents
|
||||
prompt_to_use = (
|
||||
CODEACT_TESTGEN_PROMPT_ITERATE
|
||||
if instance['full_pred'] is not None
|
||||
else CODEACT_TESTGEN_PROMPT
|
||||
)
|
||||
instruction = prompt_to_use.format(
|
||||
code_file=os.path.join('/testbed', instance.code_file),
|
||||
test_file=os.path.join('/testbed', instance.test_file),
|
||||
coverage_command=coverage_command,
|
||||
code_src=instance['code_src'],
|
||||
imports='\n'.join(instance.local_imports),
|
||||
workspace_dir_name=_get_swebench_workspace_dir_name(instance),
|
||||
)
|
||||
|
||||
if RUN_WITH_BROWSING:
|
||||
instruction += (
|
||||
'<IMPORTANT!>\n'
|
||||
'You SHOULD NEVER attempt to browse the web. '
|
||||
'</IMPORTANT!>\n'
|
||||
)
|
||||
|
||||
return instruction
|
||||
|
||||
|
||||
# TODO: migrate all swe-bench docker to ghcr.io/openhands
|
||||
DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/kdjain/')
|
||||
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
|
||||
|
||||
|
||||
def get_instance_docker_image(instance_id: str) -> str:
|
||||
image_name = 'sweb.eval.x86_64.' + instance_id
|
||||
image_name = image_name.replace(
|
||||
'__', '_s_'
|
||||
) # to comply with docker image naming convention
|
||||
return DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name
|
||||
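For example, with the default prefix above and an illustrative SWE-bench instance id:

```python
# get_instance_docker_image('django__django-11099')
# -> 'docker.io/kdjain/sweb.eval.x86_64.django_s_django-11099'
```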
|
||||
|
||||
def get_config(
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
) -> AppConfig:
|
||||
# We use a different instance image for each instance of TestGenEval
|
||||
base_container_image = get_instance_docker_image(instance['instance_id_swebench'])
|
||||
logger.info(
|
||||
f'Using instance container image: {base_container_image}. '
|
||||
f'Please make sure this image exists. '
|
||||
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
|
||||
)
|
||||
|
||||
config = AppConfig(
|
||||
default_agent=metadata.agent_class,
|
||||
run_as_openhands=False,
|
||||
max_iterations=metadata.max_iterations,
|
||||
runtime=os.environ.get('RUNTIME', 'eventstream'),
|
||||
sandbox=SandboxConfig(
|
||||
base_container_image=base_container_image,
|
||||
enable_auto_lint=True,
|
||||
use_host_network=False,
|
||||
# large enough timeout, since some testcases take very long to run
|
||||
timeout=300,
|
||||
# Add platform to the sandbox config to solve issue 4401
|
||||
platform='linux/amd64',
|
||||
api_key=os.environ.get('ALLHANDS_API_KEY', None),
|
||||
remote_runtime_api_url=os.environ.get(
|
||||
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
|
||||
),
|
||||
keep_runtime_alive=False,
|
||||
remote_runtime_init_timeout=3600,
|
||||
),
|
||||
# do not mount workspace
|
||||
workspace_base=None,
|
||||
workspace_mount_path=None,
|
||||
)
|
||||
config.set_llm_config(
|
||||
update_llm_config_for_completions_logging(
|
||||
metadata.llm_config, metadata.eval_output_dir, instance['id']
|
||||
)
|
||||
)
|
||||
agent_config = AgentConfig(
|
||||
codeact_enable_jupyter=False,
|
||||
codeact_enable_browsing=RUN_WITH_BROWSING,
|
||||
codeact_enable_llm_editor=False,
|
||||
condenser=metadata.condenser_config,
|
||||
enable_prompt_extensions=False,
|
||||
)
|
||||
config.set_agent_config(agent_config)
|
||||
return config
|
||||
|
||||
|
||||
def initialize_runtime(
|
||||
runtime: Runtime,
|
||||
instance: pd.Series, # the benchmark instance used to set up the sandbox
|
||||
):
|
||||
"""Initialize the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info('-' * 30)
|
||||
logger.info('BEGIN Runtime Initialization Fn')
|
||||
logger.info('-' * 30)
|
||||
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
||||
obs: CmdOutputObservation
|
||||
|
||||
instance['instance_id'] = instance['instance_id_swebench']
|
||||
|
||||
# Set instance id
|
||||
action = CmdRunAction(
|
||||
command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id_swebench']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
|
||||
)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0, f'Failed to export SWE_INSTANCE_ID: {str(obs)}'
|
||||
)
|
||||
|
||||
action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
|
||||
|
||||
# inject the init script
|
||||
script_dir = os.path.dirname(__file__)
|
||||
|
||||
# inject the instance info
|
||||
action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
|
||||
)
|
||||
|
||||
swe_instance_json_name = 'swe-bench-instance.json'
|
||||
swe_prediction = 'test_suite.py'
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Construct the full path for the desired file name within the temporary directory
|
||||
temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
|
||||
# Write to the file with the desired name within the temporary directory
|
||||
with open(temp_file_path, 'w') as f:
|
||||
if not isinstance(instance, dict):
|
||||
preprocessed_instance = _preprocess_instance(instance.to_dict())
|
||||
json.dump([preprocessed_instance], f)
|
||||
else:
|
||||
preprocessed_instance = _preprocess_instance(instance)
|
||||
json.dump([preprocessed_instance], f)
|
||||
|
||||
# Copy the file to the desired location
|
||||
runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
|
||||
|
||||
if instance['full_pred'] is not None:
|
||||
temp_file_path_pred = os.path.join(temp_dir, swe_prediction)
|
||||
with open(temp_file_path_pred, 'w') as f:
|
||||
f.write(instance['full_pred'])
|
||||
|
||||
runtime.copy_to(temp_file_path_pred, '/tmp')
|
||||
|
||||
# Copy the file to the desired location
|
||||
action = CmdRunAction(
|
||||
command=f"cp /tmp/test_suite.py /testbed/{instance['test_file']}"
|
||||
)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0, f'Failed to copy test file: {str(obs)}'
|
||||
)
|
||||
|
||||
action = CmdRunAction(
|
||||
command='git -C /testbed add . && git -C /testbed commit -m "Add test file"'
|
||||
)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to git add/commit the test file: {str(obs)}')
|
||||
|
||||
# inject the instance swe entry
|
||||
runtime.copy_to(
|
||||
str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
|
||||
'/swe_util/',
|
||||
)
|
||||
action = CmdRunAction(command='cat ~/.bashrc')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
|
||||
|
||||
action = CmdRunAction(command='source ~/.bashrc')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
if isinstance(obs, ErrorObservation):
|
||||
logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
|
||||
|
||||
action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
|
||||
)
|
||||
|
||||
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
|
||||
)
|
||||
|
||||
action = CmdRunAction(command='git reset --hard')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')
|
||||
|
||||
action = CmdRunAction(
|
||||
command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
|
||||
)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
|
||||
|
||||
logger.info('-' * 30)
|
||||
logger.info('END Runtime Initialization Fn')
|
||||
logger.info('-' * 30)
|
||||
|
||||
|
||||
def complete_runtime(
|
||||
runtime: Runtime,
|
||||
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
|
||||
) -> dict[str, Any]:
|
||||
"""Complete the runtime for the agent.
|
||||
|
||||
This function is called after the agent has run.
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
try:
|
||||
logger.info('-' * 30)
|
||||
logger.info('BEGIN Runtime Completion Fn')
|
||||
logger.info('-' * 30)
|
||||
obs: CmdOutputObservation
|
||||
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
||||
|
||||
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
|
||||
)
|
||||
|
||||
action = CmdRunAction(command=f'cat {instance.test_file}')
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert_and_raise(
|
||||
obs.exit_code == 0,
|
||||
f'Failed to find file: {instance.test_file} in /workspace/{workspace_dir_name}',
|
||||
)
|
||||
|
||||
test_suite = obs.content.strip()
|
||||
except Exception:
|
||||
# Print stack trace
|
||||
print('Skipping, exception in complete_runtime')
|
||||
print(traceback.format_exc())
|
||||
test_suite = instance['full_pred'] if instance['full_pred'] is not None else ''
|
||||
|
||||
# action = CmdRunAction(command='git add -A')
|
||||
# action.set_hard_timeout(600)
|
||||
# logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
# obs = runtime.run_action(action)
|
||||
# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
# assert_and_raise(obs.exit_code == 0, f'Failed to git add -A: {str(obs)}')
|
||||
|
||||
logger.info('-' * 30)
|
||||
logger.info('END Runtime Completion Fn')
|
||||
logger.info('-' * 30)
|
||||
return {
|
||||
'test_suite': test_suite,
|
||||
}
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
) -> EvalOutput:
|
||||
config = get_config(instance, metadata)
|
||||
start_time = time.time() # Track start time
|
||||
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
|
||||
reset_logger_for_multiprocessing(logger, instance.id, log_dir)
|
||||
else:
|
||||
logger.info(f'Starting evaluation for instance {instance.id}.')
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
|
||||
try:
|
||||
initialize_runtime(runtime, instance)
|
||||
|
||||
instruction = get_instruction(instance, metadata)
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
config=config,
|
||||
initial_user_action=MessageAction(content=instruction),
|
||||
runtime=runtime,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
metadata.agent_class
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
# if fatal error, throw EvalError to trigger re-run
|
||||
if is_fatal_evaluation_error(state.last_error):
|
||||
raise EvalException('Fatal error detected: ' + state.last_error)
|
||||
|
||||
# ======= THIS IS SWE-Bench specific =======
|
||||
return_val = complete_runtime(runtime, instance)
|
||||
test_suite = return_val['test_suite']
|
||||
logger.info(
|
||||
f'Got test suite for instance {instance.instance_id}:\n--------\n{test_suite}\n--------'
|
||||
)
|
||||
finally:
|
||||
runtime.close()
|
||||
|
||||
end_time = time.time()
|
||||
elapsed_time = end_time - start_time
|
||||
logger.info(
|
||||
f'Evaluation for instance {instance.instance_id} took {elapsed_time:.2f} seconds.'
|
||||
)
|
||||
|
||||
# ==========================================
|
||||
|
||||
# ======= Attempt to evaluate the agent's edits =======
|
||||
# we use eval_infer.sh to evaluate the agent's edits, not here
|
||||
# because the agent may alter the environment / testcases
|
||||
test_result = {
|
||||
'test_suite': test_suite,
|
||||
'elapsed_time': elapsed_time,
|
||||
}
|
||||
|
||||
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
||||
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
||||
if state is None:
|
||||
raise ValueError('State should not be None.')
|
||||
|
||||
histories = [event_to_dict(event) for event in state.history]
|
||||
metrics = get_metrics(state)
|
||||
|
||||
# Save the output
|
||||
output = EvalOutput(
|
||||
instance_id=instance.id,
|
||||
instruction=instruction,
|
||||
instance=_preprocess_instance(instance.to_dict()), # SWE Bench specific
|
||||
test_result=test_result,
|
||||
metadata=metadata,
|
||||
history=histories,
|
||||
metrics=metrics,
|
||||
error=state.last_error if state and state.last_error else None,
|
||||
)
|
||||
# print(output)
|
||||
return output
|
||||
|
||||
|
||||
def prepare_dataset_pre(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
|
||||
file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
data = toml.load(file)
|
||||
if 'selected_ids' in data:
|
||||
selected_ids = data['selected_ids']
|
||||
logger.info(
|
||||
f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
|
||||
)
|
||||
subset = dataset[dataset[filter_column].isin(selected_ids)]
|
||||
logger.info(f'Retained {subset.shape[0]} tasks after filtering')
|
||||
|
||||
subset['instance_id_swebench'] = subset['instance_id']
|
||||
subset['instance_id'] = subset['id']
|
||||
return subset
|
||||
|
||||
dataset['instance_id_swebench'] = dataset['instance_id']
|
||||
dataset['instance_id'] = dataset['id']
|
||||
return dataset
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = get_parser()
|
||||
parser.add_argument(
|
||||
'--dataset',
|
||||
type=str,
|
||||
default='kjain/testgenevallite',
|
||||
help='data set to evaluate on, either full-test or lite-test',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--split',
|
||||
type=str,
|
||||
default='test',
|
||||
help='split to evaluate on',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--testfile_start',
|
||||
action='store_true',
|
||||
help='Whether to start from the 0 shot test file',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--zero_shot_path',
|
||||
type=str,
|
||||
help='Path to the zero shot test file predictions',
|
||||
)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if args.testfile_start and not args.zero_shot_path:
|
||||
raise ValueError(
|
||||
'If you want to start from the 0 shot test file, you must provide the path to the zero shot test file predictions'
|
||||
)
|
||||
|
||||
preds_map = {}
|
||||
if args.testfile_start:
|
||||
with open(args.zero_shot_path, 'r') as f:
|
||||
for line in f:
|
||||
pred = json.loads(line)
|
||||
preds_map[pred['id']] = pred['preds']['full'][0]
|
||||
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenHands's repo
|
||||
dataset = load_dataset(args.dataset, split=args.split)
|
||||
logger.info(f'Loaded dataset {args.dataset} with split {args.split}')
|
||||
testgeneval_filepairs = prepare_dataset_pre(dataset.to_pandas(), 'id')
|
||||
|
||||
llm_config = None
|
||||
if args.llm_config:
|
||||
llm_config = get_llm_config_arg(args.llm_config)
|
||||
llm_config.log_completions = True
|
||||
# modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
|
||||
llm_config.modify_params = False
|
||||
|
||||
if llm_config is None:
|
||||
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
|
||||
|
||||
details = {}
|
||||
_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
|
||||
|
||||
dataset_description = (
|
||||
args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
|
||||
)
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
dataset_description,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
details=details,
|
||||
)
|
||||
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(testgeneval_filepairs, output_file, args.eval_n_limit)
|
||||
|
||||
if not instances.empty:
|
||||
instances['full_pred'] = (
|
||||
instances['instance_id']
|
||||
.map(preds_map)
|
||||
.apply(lambda x: x if pd.notna(x) else None)
|
||||
)
|
||||
|
||||
run_evaluation(
|
||||
instances, metadata, output_file, args.eval_num_workers, process_instance
|
||||
)
|
||||
@@ -0,0 +1,128 @@
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
|
||||
# Function to run shell commands
|
||||
def run_command(command):
|
||||
try:
|
||||
subprocess.run(command, check=True, shell=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f'An error occurred: {e}')
|
||||
|
||||
|
||||
# Function to log in to Docker Hub
|
||||
def docker_login():
|
||||
print('Logging into Docker Hub...')
|
||||
run_command('docker login')
|
||||
|
||||
|
||||
# Function to generate Dockerfile content based on image type
|
||||
def generate_dockerfile_content(
|
||||
base_image, dependencies, datum, patch_path, test_patch_path
|
||||
):
|
||||
dockerfile_content = f"""
|
||||
FROM {base_image}
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
RUN source /opt/miniconda3/bin/activate && conda activate testbed && pip install {' '.join(dependencies)}
|
||||
COPY {patch_path} /app/patch.diff
|
||||
RUN git apply /app/patch.diff
|
||||
RUN rm /app/patch.diff
|
||||
COPY {test_patch_path} /app/patch.diff
|
||||
RUN git apply /app/patch.diff
|
||||
RUN git config --global user.email ""
|
||||
RUN git config --global user.name "TestGenEval"
|
||||
RUN rm /app/patch.diff
|
||||
RUN rm {datum['test_file']}
|
||||
"""
|
||||
|
||||
# Add specific content based on image type
|
||||
dockerfile_content += 'RUN git add .\nRUN git commit -m "Testing fixes"'
|
||||
|
||||
return dockerfile_content
|
||||
|
||||
|
||||
# Function to build, push, and clean up Docker images
|
||||
def build_and_push_image(dockerfile_content, image_name):
|
||||
with open('Dockerfile.temp', 'w') as dockerfile:
|
||||
dockerfile.write(dockerfile_content)
|
||||
run_command(f'docker build -f Dockerfile.temp -t {image_name} .')
|
||||
run_command(f'docker push {image_name}')
|
||||
run_command(f'docker rmi {image_name}')
|
||||
os.remove('Dockerfile.temp')
|
||||
|
||||
|
||||
# Function to process images with .eval in the name
|
||||
def process_images(dataset, original_namespace, new_namespace, start_instance_id):
|
||||
dependencies = ['coverage', 'cosmic-ray']
|
||||
|
||||
found_start = len(start_instance_id) == 0
|
||||
for datum in dataset:
|
||||
if not found_start and datum['instance_id'] == start_instance_id:
|
||||
found_start = True
|
||||
elif found_start:
|
||||
full_image_name = f'{original_namespace}/sweb.eval.x86_64.{datum["instance_id"].replace("__", "_s_")}:latest'
|
||||
print(f'Processing image: {full_image_name}')
|
||||
run_command(f'docker pull {full_image_name}')
|
||||
|
||||
# Save patches and preds_context to regular files
|
||||
patch_file_path = 'patch.diff'
|
||||
test_patch_file_path = 'test_patch.diff'
|
||||
|
||||
with open(patch_file_path, 'w') as patch_file, open(
|
||||
test_patch_file_path, 'w'
|
||||
) as test_patch_file:
|
||||
patch_file.write(datum['patch'])
|
||||
test_patch_file.write(datum['test_patch'])
|
||||
|
||||
# Define image types and corresponding tags
|
||||
new_image_name = f'{new_namespace}/sweb.eval.x86_64.{datum["instance_id"].replace("__", "_s_")}:latest'
|
||||
dockerfile_content = generate_dockerfile_content(
|
||||
full_image_name,
|
||||
dependencies,
|
||||
datum,
|
||||
patch_file_path,
|
||||
test_patch_file_path,
|
||||
)
|
||||
build_and_push_image(dockerfile_content, new_image_name)
|
||||
|
||||
# Cleanup regular files and images
|
||||
os.remove(patch_file_path)
|
||||
os.remove(test_patch_file_path)
|
||||
run_command(f'docker rmi {full_image_name}')
|
||||
run_command('docker system prune -f') # Clean up dangling resources
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Process Docker images with .eval in the name.'
|
||||
)
|
||||
parser.add_argument('--dataset', type=str, default='kjain14/testgeneval')
|
||||
parser.add_argument('--split', type=str, default='test')
|
||||
parser.add_argument(
|
||||
'--new_namespace',
|
||||
type=str,
|
||||
default='kdjain',
|
||||
help='The new Docker Hub namespace to push the images',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--original_namespace',
|
||||
type=str,
|
||||
default='xingyaoww',
|
||||
help='The original Docker Hub namespace',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--start_instance_id',
|
||||
type=str,
|
||||
default='',
|
||||
help='The instance_id to start processing from',
|
||||
)
|
||||
args = parser.parse_args()
|
||||
dataset = load_dataset(args.dataset)[args.split]
|
||||
|
||||
docker_login()
|
||||
process_images(
|
||||
dataset, args.original_namespace, args.new_namespace, args.start_instance_id
|
||||
)
|
||||
(File diff suppressed because it is too large)
@@ -0,0 +1,196 @@
|
||||
sweb.base.x86_64:latest
|
||||
sweb.env.x86_64.088a7e628bda9770f9757b:latest
|
||||
sweb.env.x86_64.0d80c7dec81ee2f2f513e2:latest
|
||||
sweb.env.x86_64.0f99bce2750f3109957bec:latest
|
||||
sweb.env.x86_64.1b3b218535da0abf4469cb:latest
|
||||
sweb.env.x86_64.1c1a6945f732f9391228c5:latest
|
||||
sweb.env.x86_64.1f92e6d7cef88badc4f744:latest
|
||||
sweb.env.x86_64.27dd9791e13f5c857a09f9:latest
|
||||
sweb.env.x86_64.297af196949a2a635bce66:latest
|
||||
sweb.env.x86_64.2baaea72acc974f6c02079:latest
|
||||
sweb.env.x86_64.2e50125951bc69cddd7421:latest
|
||||
sweb.env.x86_64.2f217c8b4490bfa0e2ba14:latest
|
||||
sweb.env.x86_64.31244378a92e3bcce809ac:latest
|
||||
sweb.env.x86_64.428468730904ff6b4232aa:latest
|
||||
sweb.env.x86_64.5d1fda9d55d65d8a4e5bdb:latest
|
||||
sweb.env.x86_64.6b007979cf533f0f3016e8:latest
|
||||
sweb.env.x86_64.7037e8c448a4b8ebfe9b13:latest
|
||||
sweb.env.x86_64.71498c7426dbf05599642f:latest
|
||||
sweb.env.x86_64.756beac07713d7e8dc1129:latest
|
||||
sweb.env.x86_64.78278ae2cf880e395f1337:latest
|
||||
sweb.env.x86_64.8f1f7b974f0c57c7aeba39:latest
|
||||
sweb.env.x86_64.934a137824256b612e9dc5:latest
|
||||
sweb.env.x86_64.a0efca7a0fe6719dbf65c2:latest
|
||||
sweb.env.x86_64.a18371b03f944585b4f08c:latest
|
||||
sweb.env.x86_64.a33dddf55cdff5d8e23374:latest
|
||||
sweb.env.x86_64.aa92880033da20ca313928:latest
|
||||
sweb.env.x86_64.b649f0ff62fad147f7f073:latest
|
||||
sweb.env.x86_64.b7ce4be3b3c35f68c61248:latest
|
||||
sweb.env.x86_64.c70909fdac4897d1c685df:latest
|
||||
sweb.env.x86_64.c795f4b88616b8462021ed:latest
|
||||
sweb.env.x86_64.cc47cc71483942d0c3a15e:latest
|
||||
sweb.env.x86_64.dc5ff4c0e3fe8db5afc4da:latest
|
||||
sweb.env.x86_64.e3afd7f04b325a4de4982d:latest
|
||||
sweb.env.x86_64.e5bb89bf78258a7d14c34b:latest
|
||||
sweb.env.x86_64.e83e37f52c09532c62acfb:latest
|
||||
sweb.env.x86_64.efa6065ed5bf204410fd53:latest
|
||||
sweb.eval.x86_64.django_s_django-17087:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10508:latest
|
||||
sweb.eval.x86_64.django_s_django-14017:latest
|
||||
sweb.eval.x86_64.django_s_django-11422:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-14774:latest
|
||||
sweb.eval.x86_64.django_s_django-14915:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-22005:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-5221:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-17022:latest
|
||||
sweb.eval.x86_64.django_s_django-15996:latest
|
||||
sweb.eval.x86_64.django_s_django-15252:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-21171:latest
|
||||
sweb.eval.x86_64.django_s_django-11797:latest
|
||||
sweb.eval.x86_64.django_s_django-16046:latest
|
||||
sweb.eval.x86_64.django_s_django-11583:latest
|
||||
sweb.eval.x86_64.django_s_django-15738:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-21612:latest
|
||||
sweb.eval.x86_64.astropy_s_astropy-12907:latest
|
||||
sweb.eval.x86_64.django_s_django-11620:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-16792:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13779:latest
|
||||
sweb.eval.x86_64.django_s_django-16041:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-13471:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-20442:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-20049:latest
|
||||
sweb.eval.x86_64.django_s_django-14411:latest
|
||||
sweb.eval.x86_64.django_s_django-13447:latest
|
||||
sweb.eval.x86_64.django_s_django-12856:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10949:latest
|
||||
sweb.eval.x86_64.django_s_django-14787:latest
|
||||
sweb.eval.x86_64.django_s_django-11815:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13584:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14087:latest
|
||||
sweb.eval.x86_64.django_s_django-15388:latest
|
||||
sweb.eval.x86_64.django_s_django-11179:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-24102:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-24213:latest
|
||||
sweb.eval.x86_64.django_s_django-15781:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-8906:latest
|
||||
sweb.eval.x86_64.django_s_django-13710:latest
|
||||
sweb.eval.x86_64.django_s_django-13925:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14092:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-7373:latest
|
||||
sweb.eval.x86_64.matplotlib_s_matplotlib-25498:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-5227:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-15678:latest
|
||||
sweb.eval.x86_64.django_s_django-13551:latest
|
||||
sweb.eval.x86_64.django_s_django-14155:latest
|
||||
sweb.eval.x86_64.django_s_django-13933:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-21055:latest
|
||||
sweb.eval.x86_64.django_s_django-13660:latest
|
||||
sweb.eval.x86_64.django_s_django-16527:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-5692:latest
|
||||
sweb.eval.x86_64.mwaskom_s_seaborn-3010:latest
|
||||
sweb.eval.x86_64.django_s_django-12700:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-11400:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-23117:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-20639:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-23262:latest
|
||||
sweb.eval.x86_64.django_s_django-15498:latest
|
||||
sweb.eval.x86_64.django_s_django-12453:latest
|
||||
sweb.eval.x86_64.django_s_django-14999:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-13480:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-21847:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-15011:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25570:latest
|
||||
sweb.eval.x86_64.sphinx-doc_s_sphinx-7975:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14983:latest
|
||||
sweb.eval.x86_64.django_s_django-14534:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-14396:latest
|
||||
sweb.eval.x86_64.matplotlib_s_matplotlib-25442:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-15535:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-22714:latest
|
||||
sweb.eval.x86_64.django_s_django-15789:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-21627:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-24066:latest
|
||||
sweb.eval.x86_64.pylint-dev_s_pylint-7993:latest
|
||||
sweb.eval.x86_64.django_s_django-14752:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-18835:latest
|
||||
sweb.eval.x86_64.django_s_django-17051:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-12171:latest
|
||||
sweb.eval.x86_64.pydata_s_xarray-3364:latest
|
||||
sweb.eval.x86_64.mwaskom_s_seaborn-3190:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-7168:latest
|
||||
sweb.eval.x86_64.django_s_django-12747:latest
|
||||
sweb.eval.x86_64.django_s_django-15695:latest
|
||||
sweb.eval.x86_64.matplotlib_s_matplotlib-22835:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-12481:latest
|
||||
sweb.eval.x86_64.django_s_django-15851:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-14024:latest
|
||||
sweb.eval.x86_64.django_s_django-14608:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-9359:latest
|
||||
sweb.eval.x86_64.django_s_django-16873:latest
|
||||
sweb.eval.x86_64.matplotlib_s_matplotlib-25433:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-13031:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-7432:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25747:latest
|
||||
sweb.eval.x86_64.django_s_django-12286:latest
|
||||
sweb.eval.x86_64.django_s_django-11910:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-12471:latest
|
||||
sweb.eval.x86_64.pylint-dev_s_pylint-5859:latest
|
||||
sweb.eval.x86_64.django_s_django-11133:latest
|
||||
sweb.eval.x86_64.astropy_s_astropy-14365:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13496:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-19487:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-13895:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-15345:latest
|
||||
sweb.eval.x86_64.django_s_django-13590:latest
|
||||
sweb.eval.x86_64.django_s_django-13757:latest
|
||||
sweb.eval.x86_64.django_s_django-16379:latest
|
||||
sweb.eval.x86_64.django_s_django-13768:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-8365:latest
|
||||
sweb.eval.x86_64.django_s_django-14580:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-20154:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-12419:latest
|
||||
sweb.eval.x86_64.django_s_django-12125:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-24152:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-15512:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-18621:latest
|
||||
sweb.eval.x86_64.pydata_s_xarray-4248:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-11040:latest
|
||||
sweb.eval.x86_64.django_s_django-11099:latest
|
||||
sweb.eval.x86_64.django_s_django-16816:latest
|
||||
sweb.eval.x86_64.django_s_django-13265:latest
|
||||
sweb.eval.x86_64.django_s_django-16139:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10297:latest
|
||||
sweb.eval.x86_64.django_s_django-14016:latest
|
||||
sweb.eval.x86_64.pallets_s_flask-5063:latest
|
||||
sweb.eval.x86_64.astropy_s_astropy-7746:latest
|
||||
sweb.eval.x86_64.matplotlib_s_matplotlib-24265:latest
|
||||
sweb.eval.x86_64.django_s_django-13448:latest
|
||||
sweb.eval.x86_64.django_s_django-12908:latest
|
||||
sweb.eval.x86_64.sphinx-doc_s_sphinx-8627:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-14317:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-6116:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-23191:latest
|
||||
sweb.eval.x86_64.pydata_s_xarray-5131:latest
|
||||
sweb.eval.x86_64.django_s_django-11019:latest
|
||||
sweb.eval.x86_64.matplotlib_s_matplotlib-23913:latest
|
||||
sweb.eval.x86_64.django_s_django-15790:latest
|
||||
sweb.eval.x86_64.django_s_django-12497:latest
|
||||
sweb.eval.x86_64.matplotlib_s_matplotlib-26020:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25638:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25500:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-19007:latest
|
||||
sweb.eval.x86_64.django_s_django-12308:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-7220:latest
|
||||
sweb.eval.x86_64.django_s_django-11848:latest
|
||||
sweb.eval.x86_64.django_s_django-15347:latest
|
||||
sweb.eval.x86_64.pytest-dev_s_pytest-7490:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-18532:latest
|
||||
sweb.eval.x86_64.django_s_django-14997:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-24909:latest
|
||||
sweb.eval.x86_64.django_s_django-13220:latest
|
||||
sweb.eval.x86_64.sympy_s_sympy-21614:latest
|
||||
sweb.eval.x86_64.django_s_django-15902:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13497:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13439:latest
|
||||
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14894:latest
|
||||
sweb.eval.x86_64.django_s_django-12983:latest
|
||||
@@ -0,0 +1,31 @@
def print_diff_ignore_order(file1, file2):
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        file1_lines = set(f1.readlines())
        file2_lines = set(f2.readlines())

    only_in_file1 = file1_lines - file2_lines
    only_in_file2 = file2_lines - file1_lines

    if only_in_file1:
        print(f'Lines in {file1} but not in {file2}:')
        for line in sorted(only_in_file1):
            print(f'- {line.strip()}')

    # if only_in_file2:
    #     print(f"Lines in {file2} but not in {file1}:")
    #     for line in sorted(only_in_file2):
    #         print(f"+ {line.strip()}")

    if not only_in_file1 and not only_in_file2:
        print('The files have the same content (ignoring line order).')


if __name__ == '__main__':
    # Usage
    lite1 = 'all-swebench-lite-instance-images.txt'  # Replace with the path to your first file
    lite2 = '../../swe_bench/scripts/docker/all-swebench-lite-instance-images.txt'  # Replace with the path to your second file
    print_diff_ignore_order(lite1, lite2)

    full1 = 'all-swebench-full-instance-images.txt'  # Replace with the path to your first file
    full2 = '../../swe_bench/scripts/docker/all-swebench-full-instance-images.txt'  # Replace with the path to your second file
    print_diff_ignore_order(full1, full2)
48
evaluation/benchmarks/testgeneval/scripts/docker/delete_all_images.sh
Executable file
@@ -0,0 +1,48 @@
#!/bin/bash
# Script will delete all repositories and tags in your Docker Hub account
set -e

# Set username and password from command-line arguments
UNAME=$1
UPASS=$2

# Get token to interact with Docker Hub
TOKEN=$(curl -s -H "Content-Type: application/json" -X POST -d '{"username": "'${UNAME}'", "password": "'${UPASS}'"}' https://hub.docker.com/v2/users/login/ | jq -r .token)

# Ensure token retrieval was successful
if [[ -z "$TOKEN" ]]; then
  echo "Failed to obtain authentication token. Please check your credentials."
  exit 1
fi

# Get list of repositories for that user account
echo "Listing repositories in Docker Hub account '${UNAME}':"
REPO_LIST=$(curl -s -H "Authorization: JWT ${TOKEN}" "https://hub.docker.com/v2/repositories/${UNAME}/?page_size=10000" | jq -r '.results|.[]|.name')
if [[ -z "$REPO_LIST" ]]; then
  echo "No repositories found for user '${UNAME}' or failed to fetch repositories."
  exit 1
fi

# Loop through each repository and delete its tags and the repository itself
for rep in ${REPO_LIST}; do
  echo "Processing repository: ${UNAME}/${rep}"

  # Get all tags for the repository
  IMAGES=$(curl -s -H "Authorization: JWT ${TOKEN}" "https://hub.docker.com/v2/repositories/${UNAME}/${rep}/tags/?page_size=100")
  IMAGE_TAGS=$(echo $IMAGES | jq -r '.results|.[]|.name')

  # Delete each tag
  for tag in ${IMAGE_TAGS}; do
    echo "Deleting tag: ${UNAME}/${rep}:${tag}"
    curl -s -X DELETE -H "Authorization: JWT ${TOKEN}" "https://hub.docker.com/v2/repositories/${UNAME}/${rep}/tags/${tag}/"
  done

  # Delete the repository itself
  echo "Deleting repository: ${UNAME}/${rep}"
  curl -s -X DELETE -H "Authorization: JWT ${TOKEN}" "https://hub.docker.com/v2/repositories/${UNAME}/${rep}/" || {
    echo "Failed to delete repository '${UNAME}/${rep}'. Please check permissions or API limits."
  }
  sleep 1
done

echo "Script execution completed."
@@ -0,0 +1,18 @@
from datasets import load_dataset


def dataset_to_txt(dataset, txt_file, split='test'):
    with open(txt_file, 'w') as f:
        for datum in dataset[split]:
            instance_id = datum['instance_id'].replace('__', '_s_')
            f.write(f'sweb.eval.x86_64.{instance_id}:latest\n')


if __name__ == '__main__':
    # Load the private dataset
    dataset = load_dataset('kjain14/testgeneval')
    dataset_lite = load_dataset('kjain14/testgenevallite')

    dataset_to_txt(dataset_lite, 'all-swebench-lite-instance-images.txt')
    dataset_to_txt(dataset, 'all-swebench-full-instance-images.txt')
@@ -0,0 +1,173 @@
import argparse
import copy
import difflib
import json
import os
import traceback


def insert_line_in_string(input_string, new_str, insert_line):
    """
    Inserts a new line into a string at the specified line number.

    :param input_string: The original string.
    :param new_str: The string to insert.
    :param insert_line: The line number at which to insert (1-based index).
    :return: The modified string.
    """
    file_text = input_string.expandtabs()
    new_str = new_str.expandtabs()

    file_text_lines = file_text.split('\n')

    new_str_lines = new_str.split('\n')
    new_file_text_lines = (
        file_text_lines[:insert_line] + new_str_lines + file_text_lines[insert_line:]
    )

    return '\n'.join(new_file_text_lines)


def print_string_diff(original, modified):
    """
    Prints the differences between two strings line by line.

    :param original: The original string.
    :param modified: The modified string.
    """
    original_lines = original.splitlines(keepends=True)
    modified_lines = modified.splitlines(keepends=True)

    diff = difflib.unified_diff(
        original_lines,
        modified_lines,
        fromfile='original',
        tofile='modified',
        lineterm='',
    )

    print(''.join(diff))


def parse_json_files(root_dir, output_dir, metadata_objs, preds_objs):
    final_output = {i: [] for i in range(25)}

    for subdir in sorted(os.listdir(root_dir)):  # Sorting ensures consistent order
        subdir_path = os.path.join(root_dir, subdir)
        # subdir_instance = subdir.rsplit('-', 1)[0]
        metadata = metadata_objs[subdir]
        orig_test_suite = metadata['test_result']['test_suite']

        if os.path.isdir(subdir_path):  # Check if it's a directory
            print(f'Processing subdirectory: {subdir}')

            # Now loop through the JSON files in this subdirectory
            i = 0
            test_suite = preds_objs[subdir] if subdir in preds_objs else ''
            for file in sorted(os.listdir(subdir_path)):  # Sorting ensures consistent order
                metadata_copy = copy.deepcopy(metadata)
                if file.endswith('.json'):  # Check for JSON files
                    file_path = os.path.join(subdir_path, file)
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            data = json.load(f)  # Load JSON data
                            try:
                                tool_calls = data['response']['choices'][0]['message']['tool_calls']
                                if tool_calls is not None:
                                    for tool_call in tool_calls:
                                        tool_call_dict = eval(tool_call['function']['arguments'])

                                        if tool_call_dict is not None and tool_call_dict != {}:
                                            command = tool_call_dict['command']
                                            if command == 'create':
                                                test_suite = tool_call_dict['file_text']
                                            if (
                                                command != 'str_replace'
                                                and command != 'insert'
                                                and 'coverage' not in command
                                            ):
                                                print(command)
                                            if command == 'insert':
                                                test_suite_new = insert_line_in_string(
                                                    test_suite,
                                                    tool_call_dict['new_str'],
                                                    tool_call_dict['insert_line'],
                                                )
                                                test_suite = test_suite_new
                                            if command == 'str_replace':
                                                if test_suite.count(tool_call_dict['old_str']) == 1:
                                                    test_suite_new = test_suite.replace(
                                                        tool_call_dict['old_str'],
                                                        tool_call_dict['new_str'],
                                                    )
                                                else:
                                                    continue
                                                test_suite = test_suite_new
                            except Exception:
                                print(traceback.format_exc())
                                continue

                            metadata_copy['test_result']['test_suite'] = test_suite
                            if i < 25:
                                final_output[i].append(metadata_copy)
                            i += 1
                    except Exception as e:
                        print(traceback.format_exc())
                        print(f' Error loading {file_path}: {e}')

            for j in range(i, 24):
                final_output[j].append(metadata_copy)
            metadata_orig = copy.deepcopy(metadata)
            metadata_orig['test_result']['test_suite'] = orig_test_suite
            final_output[24].append(metadata_orig)

    for i in range(25):
        output_file = os.path.join(output_dir, f'output_{i}.jsonl')
        with open(output_file, 'w') as f:
            for metadata in final_output[i]:
                f.write(json.dumps(metadata) + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Parse JSON file')
    parser.add_argument('--root_dir', type=str, help='Root directory', required=True)
    parser.add_argument('--output_dir', type=str, help='Output directory', required=True)
    parser.add_argument('--starting_preds_file', type=str, help='Starting predictions', default=None)
    args = parser.parse_args()

    output_file = os.path.join(args.output_dir, 'output.jsonl')
    metadata_objs = {}
    with open(output_file, 'r') as f:
        content = f.readlines()
        for line in content:
            metadata = json.loads(line)
            metadata_objs[metadata['instance_id']] = metadata

    starting_preds_file = args.starting_preds_file
    preds_objs = {}
    if starting_preds_file is not None:
        with open(starting_preds_file, 'r') as f:
            content = f.readlines()
            for line in content:
                pred = json.loads(line)
                preds_objs[pred['id']] = pred['preds']['full'][0]

    parse_json_files(args.root_dir, args.output_dir, metadata_objs, preds_objs)
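The script above rebuilds intermediate test suites by replaying the editor tool calls recorded in each completion JSON. As a rough illustration of that replay logic only, here is a self-contained sketch that applies hypothetical 'create' and 'insert' tool-call dicts to an in-memory string (no completion files or metadata are involved):

# Sketch of the tool-call replay used by parse_json_files, on hypothetical
# tool-call dicts. Mirrors the 'create' / 'insert' / 'str_replace' handling.
def replay_tool_calls(tool_calls, test_suite=''):
    for call in tool_calls:
        command = call['command']
        if command == 'create':
            # 'create' replaces the whole test suite with the new file text
            test_suite = call['file_text']
        elif command == 'insert':
            # 'insert' adds new lines after a 1-based line number
            lines = test_suite.split('\n')
            new_lines = call['new_str'].split('\n')
            test_suite = '\n'.join(
                lines[: call['insert_line']] + new_lines + lines[call['insert_line'] :]
            )
        elif command == 'str_replace' and test_suite.count(call['old_str']) == 1:
            # 'str_replace' is only applied when the old string is unambiguous
            test_suite = test_suite.replace(call['old_str'], call['new_str'])
    return test_suite


if __name__ == '__main__':
    calls = [
        {'command': 'create', 'file_text': 'def test_a():\n    assert True\n'},
        {'command': 'insert', 'insert_line': 2, 'new_str': '\ndef test_b():\n    assert 1 + 1 == 2'},
    ]
    print(replay_tool_calls(calls))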
67
evaluation/benchmarks/testgeneval/scripts/eval/compare_outputs.py
Executable file
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
import argparse

import pandas as pd

parser = argparse.ArgumentParser(
    description='Compare two TestGenEval output JSONL files and print the resolved diff'
)
parser.add_argument('input_file_1', type=str)
parser.add_argument('input_file_2', type=str)
args = parser.parse_args()

df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
df2 = pd.read_json(args.input_file_2, orient='records', lines=True)


# Get the intersection of the ids
df = pd.merge(df1, df2, on='id', how='inner')


def _get_coverage(report):
    if report is None:
        return False
    if isinstance(report, float):
        return False
    else:
        return report.get('test_pass', False)


df['test_pass_x'] = df['test_pass_x'].apply(_get_coverage)
df['test_pass_y'] = df['test_pass_y'].apply(_get_coverage)
df['diff'] = df.apply(lambda x: x['test_pass_x'] != x['test_pass_y'], axis=1)

df_diff = df[df['diff']].sort_values(
    by=['test_pass_x', 'test_pass_y'], ascending=[False, False]
)
# skip if any of the pass is nan, which means one of the eval is not finished yet
df_diff = df_diff[df_diff['test_pass_x'].notna() & df_diff['test_pass_y'].notna()]

print(f'X={args.input_file_1}')
print(f'Y={args.input_file_2}')
print(f'# diff={df_diff.shape[0]}')
df_diff = df_diff[['id', 'test_pass_x', 'test_pass_y', 'report_x', 'report_y']]

# x pass but y not
print('-' * 100)
df_diff_x_only = df_diff[df_diff['test_pass_x'] & ~df_diff['test_pass_y']].sort_values(by='id')
print(f'# x pass but y not={df_diff_x_only.shape[0]}')
print(df_diff_x_only[['id', 'report_x', 'report_y']])

# y pass but x not
print('-' * 100)
df_diff_y_only = df_diff[~df_diff['test_pass_x'] & df_diff['test_pass_y']].sort_values(by='id')
print(f'# y pass but x not={df_diff_y_only.shape[0]}')
print(df_diff_y_only[['id', 'report_x', 'report_y']])
# get instance_id from df_diff_y_only
print('-' * 100)
print('Instances that x pass but y not:')
print(df_diff_x_only['id'].tolist())

print('-' * 100)
print('Instances that y pass but x not:')
print(df_diff_y_only['id'].tolist())
@@ -0,0 +1,28 @@
#!/bin/bash

FOLDER_PATH=$1
NEW_FOLDER_PATH=${FOLDER_PATH}.swebench_submission
mkdir -p $NEW_FOLDER_PATH

# Build all_preds.jsonl
poetry run python evaluation/benchmarks/testgeneval/scripts/eval/convert_oh_output_to_swe_json.py $FOLDER_PATH/output.jsonl
mv $FOLDER_PATH/output.swebench.jsonl $NEW_FOLDER_PATH/all_preds.jsonl

# Build trajs/
mkdir -p $NEW_FOLDER_PATH/trajs
for instance_dir in $FOLDER_PATH/llm_completions/*/; do
  instance_id=$(basename "$instance_dir")
  latest_json=$(ls -t "$instance_dir"/*.json | head -n1)
  if [ -n "$latest_json" ]; then
    cat "$latest_json" | jq -r '.messages' > "$NEW_FOLDER_PATH/trajs/$instance_id.json"
  fi
done

# Build logs/
# check if $FOLDER_PATH/eval_outputs exists, if so copy over - else raise error
if [ -d "$FOLDER_PATH/eval_outputs" ]; then
  cp -r $FOLDER_PATH/eval_outputs $NEW_FOLDER_PATH/logs
else
  echo "Error: $FOLDER_PATH/eval_outputs does not exist. You should run the local docker eval_infer.sh first."
  exit 1
fi
91
evaluation/benchmarks/testgeneval/scripts/eval/convert_oh_output_to_md.py
Executable file
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""Convert OpenHands output to a readable markdown format for visualization."""

import argparse
import json
import os

import pandas as pd
from tqdm import tqdm

from evaluation.benchmarks.testgeneval.eval_infer import process_test_suite
from openhands.events.serialization import event_from_dict

tqdm.pandas()

parser = argparse.ArgumentParser()
parser.add_argument('oh_output_file', type=str)
args = parser.parse_args()
output_md_folder = args.oh_output_file.replace('.jsonl', '.viz')
print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')

oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
# model name is the folder name of oh_output_file
model_name = os.path.basename(os.path.dirname(args.oh_output_file))


def convert_history_to_str(history):
    ret = ''
    separator = '\n\n' + '-' * 100 + '\n'

    for i, event in enumerate(history):
        if i != 0:
            ret += separator

        if isinstance(event, list):
            # "event" is a legacy pair of (action, observation)
            event_obj = event_from_dict(event[0])
            ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
            ret += str(event_obj)
            ret += separator

            event_obj = event_from_dict(event[1])
            ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
            ret += str(event_obj)
        else:
            # "event" is a single event
            event_obj = event_from_dict(event)
            ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
            ret += str(event_obj)
    return ret


def write_row_to_md_file(row):
    if 'test_suite' in row:
        test_suite = row['test_suite']
    elif 'test_result' in row and 'test_suite' in row['test_result']:
        test_suite = row['test_result']['test_suite']
    else:
        raise ValueError(f'Row {row} does not have a test_suite')

    if 'report' in row:
        coverage = row['report'].get('coverage', 0)
        mutation = row['report'].get('mutation_score', 0)
    else:
        coverage = None
        mutation = None

    id = row['id']
    filename = f'{id}.md'
    os.makedirs(output_md_folder, exist_ok=True)
    filepath = os.path.join(output_md_folder, filename)

    with open(filepath, 'w') as f:
        f.write(f'# {id} (coverage: {coverage})\n')
        f.write(f'# {id} (mutation score: {mutation})\n')

        # MetaData
        f.write('## MetaData\n')
        f.write('```json\n')
        f.write(json.dumps(row['metadata'], indent=2))
        f.write('\n```\n')

        # Trajectory
        f.write('## History\n')
        f.write(convert_history_to_str(row['history']))

        f.write('## Test Suite\n')
        f.write(f'{test_suite}\n')


oh_format.progress_apply(write_row_to_md_file, axis=1)
@@ -0,0 +1,35 @@
import argparse
import os

import pandas as pd

from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch

parser = argparse.ArgumentParser()
parser.add_argument('oh_output_file', type=str)
args = parser.parse_args()
output_filepath = args.oh_output_file.replace('.jsonl', '.swebench.jsonl')
print(f'Converting {args.oh_output_file} to {output_filepath}')

oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
# model name is the folder name of oh_output_file
model_name = os.path.basename(os.path.dirname(args.oh_output_file))


def convert_row_to_swebench_format(row):
    if 'git_patch' in row:
        model_patch = row['git_patch']
    elif 'test_result' in row and 'git_patch' in row['test_result']:
        model_patch = row['test_result']['git_patch']
    else:
        raise ValueError(f'Row {row} does not have a git_patch')

    return {
        'instance_id': row['instance_id'],
        'model_patch': process_git_patch(model_patch),
        'model_name_or_path': model_name,
    }


swebench_format = oh_format.apply(convert_row_to_swebench_format, axis=1)
swebench_format.to_json(output_filepath, lines=True, orient='records')
@@ -0,0 +1,27 @@
import argparse

import pandas as pd
from datasets import load_dataset

parser = argparse.ArgumentParser()
parser.add_argument('output_filepath', type=str, help='Path to save the output file')
parser.add_argument(
    '--dataset_name',
    type=str,
    help='Name of the dataset to download',
    default='kjain14/testgeneval',
)
parser.add_argument('--split', type=str, help='Split to download', default='test')
args = parser.parse_args()

dataset = load_dataset(args.dataset_name, split=args.split)
output_filepath = args.output_filepath
print(
    f'Downloading gold test suites from {args.dataset_name} (split: {args.split}) to {output_filepath}'
)
test_suites = [
    {'instance_id': row['instance_id'], 'test_suite': row['test_src']} for row in dataset
]
print(f'{len(test_suites)} test suites loaded')
pd.DataFrame(test_suites).to_json(output_filepath, lines=True, orient='records')
print(f'Test suites saved to {output_filepath}')
122
evaluation/benchmarks/testgeneval/scripts/eval/summarize_outputs.py
Executable file
@@ -0,0 +1,122 @@
#!/usr/bin/env python3
import argparse
import json
from collections import Counter

from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events

ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('output_file', type=str, help='The file to summarize')
    args = parser.parse_args()

    with open(args.output_file, 'r') as file:
        lines = file.readlines()

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0

    coverage = 0
    mutation_score = 0
    num_empty_suite = 0

    error_counter = Counter()

    main_agent_cost = []
    editor_cost = []
    num_turns = []

    for line in lines:
        _d = json.loads(line)

        # Cost
        costs = _d['metrics'].get('costs', [])
        _cur_main_agent_cost = 0
        _cur_editor_cost = 0
        for cost in costs:
            if isinstance(cost, float):
                # backward compatible
                _cur_main_agent_cost += cost
            else:
                if 'draft_editor' in cost['model']:
                    _cur_editor_cost += cost['cost']
                else:
                    _cur_main_agent_cost += cost['cost']

        main_agent_cost.append(_cur_main_agent_cost)
        editor_cost.append(_cur_editor_cost)

        # Turn status
        history = _d.get('history', [])
        events = [event_from_dict(event) for event in history]
        pairs = get_pairs_from_events(events)
        num_turns.append(len(pairs))

        # Suite & resolve status
        suite = _d.get('test_result', {}).get('test_suite', '')
        if suite == '':
            num_empty_suite += 1
            continue

        report = _d.get('report', {}) or {}
        coverage += report.get('coverage', 0)
        mutation_score += report.get('mutation_score', 0)

        # Error
        error = _d.get('error', None)

        if error is not None and isinstance(error, str):
            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
            contains_error = bool(error) and not agent_stuck_in_loop
            if agent_stuck_in_loop:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif contains_error:
                error_counter[error] += 1
            continue

        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    # print the error counter (with percentage)
    print(f'Average coverage over {num_lines} instances: {coverage / num_lines * 100:.2f}%')
    print(f'Average mutation score over {num_lines} instances: {mutation_score / num_lines * 100:.2f}%')

    print(f'Number of empty suite: {num_empty_suite} / {num_lines} ({num_empty_suite / num_lines * 100:.2f}%)')
    print(f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)')
    print(f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)')
    assert len(num_turns) == num_lines
    assert len(main_agent_cost) == num_lines
    assert len(editor_cost) == num_lines
    print('## Statistics')
    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
    print(f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD')

    print('## Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
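The summarizer splits per-instance cost between the main agent and the draft editor based on the model name recorded with each cost entry, accepting both the legacy bare-float format and the newer dict format. A small standalone sketch of that aggregation on hypothetical metrics entries:

# Hypothetical cost entries in the two formats the summarizer accepts:
# bare floats (legacy) and dicts with 'model' and 'cost' keys.
costs = [
    0.01,
    {'model': 'gpt-4o', 'cost': 0.05},
    {'model': 'draft_editor/gpt-4o-mini', 'cost': 0.002},
]

main_agent_cost = 0.0
editor_cost = 0.0
for cost in costs:
    if isinstance(cost, float):
        main_agent_cost += cost  # legacy entries count toward the main agent
    elif 'draft_editor' in cost['model']:
        editor_cost += cost['cost']
    else:
        main_agent_cost += cost['cost']

print(f'agent={main_agent_cost:.3f} USD, editor={editor_cost:.3f} USD')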
53
evaluation/benchmarks/testgeneval/scripts/eval_infer.sh
Executable file
@@ -0,0 +1,53 @@
#!/bin/bash
set -eo pipefail

INPUT_FILE=$1
NUM_WORKERS=$2
DATASET=$3
SPLIT=$4
SKIP_MUTATION=$5

if [ -z "$INPUT_FILE" ]; then
  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
  exit 1
fi

if [ -z "$DATASET" ]; then
  echo "DATASET not specified, use default kjain14/testgenevallite"
  DATASET="kjain14/testgenevallite"
fi

if [ -z "$SPLIT" ]; then
  echo "SPLIT not specified, use default test"
  SPLIT="test"
fi

if [ -z "$NUM_WORKERS" ]; then
  echo "NUM_WORKERS not specified, use default 1"
  NUM_WORKERS=1
fi

echo "... Evaluating on $INPUT_FILE ..."

COMMAND="poetry run python evaluation/benchmarks/testgeneval/eval_infer.py \
  --eval-num-workers $NUM_WORKERS \
  --input-file $INPUT_FILE \
  --dataset $DATASET \
  --split $SPLIT"

if [ "$SKIP_MUTATION" == "true" ]; then
  echo "Skipping mutation evaluation"
  COMMAND="$COMMAND --skip_mutation"
fi

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

echo $COMMAND
# Run the command
eval $COMMAND

# update the output with evaluation results
# poetry run python evaluation/benchmarks/testgeneval/scripts/eval/update_output_with_eval.py $INPUT_FILE
122
evaluation/benchmarks/testgeneval/scripts/run_infer.sh
Executable file
@@ -0,0 +1,122 @@
#!/bin/bash
set -eo pipefail

source "evaluation/utils/version_control.sh"

MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
MAX_ITER=$5
NUM_WORKERS=$6
DATASET=$7
SPLIT=$8
N_RUNS=$9
ZERO_SHOT_PATH=${10} # New argument for zero-shot path

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

if [ -z "$MAX_ITER" ]; then
  echo "MAX_ITER not specified, use default 100"
  MAX_ITER=100
fi

if [ -z "$USE_INSTANCE_IMAGE" ]; then
  echo "USE_INSTANCE_IMAGE not specified, use default true"
  USE_INSTANCE_IMAGE=true
fi

if [ -z "$RUN_WITH_BROWSING" ]; then
  echo "RUN_WITH_BROWSING not specified, use default false"
  RUN_WITH_BROWSING=false
fi


if [ -z "$DATASET" ]; then
  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
  DATASET="princeton-nlp/SWE-bench_Lite"
fi

if [ -z "$SPLIT" ]; then
  echo "SPLIT not specified, use default test"
  SPLIT="test"
fi

export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

get_openhands_version

echo "AGENT: $AGENT"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "SPLIT: $SPLIT"

# Default to NOT use Hint
if [ -z "$USE_HINT_TEXT" ]; then
  export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
  EVAL_NOTE="$EVAL_NOTE-no-hint"
fi

if [ "$RUN_WITH_BROWSING" = true ]; then
  EVAL_NOTE="$EVAL_NOTE-with-browsing"
fi

if [ -n "$EXP_NAME" ]; then
  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
fi

function run_eval() {
  local eval_note=$1
  COMMAND="poetry run python evaluation/benchmarks/testgeneval/run_infer.py \
    --agent-cls $AGENT \
    --llm-config $MODEL_CONFIG \
    --max-iterations $MAX_ITER \
    --eval-num-workers $NUM_WORKERS \
    --eval-note $eval_note \
    --dataset $DATASET \
    --split $SPLIT"

  if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
  fi

  if [ -n "$ZERO_SHOT_PATH" ]; then
    echo "ZERO_SHOT_PATH: $ZERO_SHOT_PATH"
    COMMAND="$COMMAND --testfile_start --zero_shot_path $ZERO_SHOT_PATH"
  fi

  eval $COMMAND
}

unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
if [ -z "$N_RUNS" ]; then
  N_RUNS=1
  echo "N_RUNS not specified, use default $N_RUNS"
fi

for i in $(seq 1 $N_RUNS); do
  current_eval_note="$EVAL_NOTE-run_$i"
  echo "EVAL_NOTE: $current_eval_note"
  run_eval $current_eval_note
done

checkout_original_branch
40
evaluation/benchmarks/testgeneval/scripts/setup/instance_swe_entry.sh
Executable file
@@ -0,0 +1,40 @@
#!/bin/bash

source ~/.bashrc
SWEUTIL_DIR=/swe_util

# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
# SWE_INSTANCE_ID=django__django-11099
if [ -z "$SWE_INSTANCE_ID" ]; then
  echo "Error: SWE_INSTANCE_ID is not set." >&2
  exit 1
fi

# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)

if [[ -z "$item" ]]; then
  echo "No item found for the provided instance ID."
  exit 1
fi

WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')

echo "WORKSPACE_NAME: $WORKSPACE_NAME"

# Clear the workspace
if [ -d /workspace ]; then
  rm -rf /workspace/*
else
  mkdir /workspace
fi
# Copy repo to workspace
if [ -d /workspace/$WORKSPACE_NAME ]; then
  rm -rf /workspace/$WORKSPACE_NAME
fi
mkdir -p /workspace
ln -s /testbed /workspace/$WORKSPACE_NAME

# Activate instance-specific environment
. /opt/miniconda3/etc/profile.d/conda.sh
conda activate testbed
27
evaluation/benchmarks/testgeneval/scripts/setup/prepare_swe_utils.sh
Executable file
@@ -0,0 +1,27 @@
#!/bin/bash

set -e
EVAL_WORKSPACE="evaluation/swe_bench/eval_workspace"
mkdir -p $EVAL_WORKSPACE

# 1. Prepare REPO
echo "==== Prepare SWE-bench repo ===="
OH_SWE_BENCH_REPO_PATH="https://github.com/All-Hands-AI/SWE-bench.git"
OH_SWE_BENCH_REPO_BRANCH="eval"
git clone -b $OH_SWE_BENCH_REPO_BRANCH $OH_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OH-SWE-bench

# 2. Prepare DATA
echo "==== Prepare SWE-bench data ===="
EVAL_IMAGE=ghcr.io/all-hands-ai/eval-swe-bench:builder_with_conda
EVAL_WORKSPACE=$(realpath $EVAL_WORKSPACE)
chmod +x $EVAL_WORKSPACE/OH-SWE-bench/swebench/harness/prepare_data.sh
if [ -d $EVAL_WORKSPACE/eval_data ]; then
  rm -r $EVAL_WORKSPACE/eval_data
fi
docker run \
  -v $EVAL_WORKSPACE:/workspace \
  -w /workspace \
  -u $(id -u):$(id -g) \
  -e HF_DATASETS_CACHE="/tmp" \
  --rm -it $EVAL_IMAGE \
  bash -c "cd OH-SWE-bench/swebench/harness && /swe_util/miniforge3/bin/conda run -n swe-bench-eval ./prepare_data.sh && mv eval_data /workspace/"
96
evaluation/benchmarks/testgeneval/scripts/setup/swe_entry.sh
Executable file
@@ -0,0 +1,96 @@
#!/bin/bash

set -e

# assert user name is `root`
if [ "$USER" != "root" ]; then
  echo "Error: This script is intended to be run by the 'root' user only." >&2
  exit 1
fi

source ~/.bashrc

SWEUTIL_DIR=/swe_util

# Create logs directory
LOG_DIR=/openhands/logs
mkdir -p $LOG_DIR && chmod 777 $LOG_DIR

# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
# SWE_INSTANCE_ID=django__django-11099
if [ -z "$SWE_INSTANCE_ID" ]; then
  echo "Error: SWE_INSTANCE_ID is not set." >&2
  exit 1
fi

# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-test-lite.json)

if [[ -z "$item" ]]; then
  echo "No item found for the provided instance ID."
  exit 1
fi

CONDA_ENV_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')

echo "CONDA_ENV_NAME: $CONDA_ENV_NAME"

SWE_TASK_DIR=/openhands/swe_tasks
mkdir -p $SWE_TASK_DIR
# Dump test_patch to /workspace/test.patch
echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
# Dump patch to /workspace/gold.patch
echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
# Dump the item to /workspace/instance.json except for the "test_patch" and "patch" fields
echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json

# Clear the workspace
rm -rf /workspace/*
# Copy repo to workspace
if [ -d /workspace/$CONDA_ENV_NAME ]; then
  rm -rf /workspace/$CONDA_ENV_NAME
fi
cp -r $SWEUTIL_DIR/eval_data/testbeds/$CONDA_ENV_NAME /workspace

# Reset swe-bench testbed and install the repo
. $SWEUTIL_DIR/miniforge3/etc/profile.d/conda.sh
conda config --set changeps1 False
conda config --append channels conda-forge
conda activate swe-bench-eval

mkdir -p $SWE_TASK_DIR/reset_testbed_temp
mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
SWE_BENCH_DIR=/swe_util/OH-SWE-bench
output=$(
  export PYTHONPATH=$SWE_BENCH_DIR && \
  cd $SWE_BENCH_DIR && \
  python swebench/harness/reset_swe_env.py \
    --swe_bench_tasks $SWEUTIL_DIR/eval_data/instances/swe-bench-test.json \
    --temp_dir $SWE_TASK_DIR/reset_testbed_temp \
    --testbed /workspace \
    --conda_path $SWEUTIL_DIR/miniforge3 \
    --instance_id $SWE_INSTANCE_ID \
    --log_dir $SWE_TASK_DIR/reset_testbed_log_dir \
    --timeout 900 \
    --verbose
)

REPO_PATH=$(echo "$output" | awk -F': ' '/repo_path:/ {print $2}')
TEST_CMD=$(echo "$output" | awk -F': ' '/test_cmd:/ {print $2}')
echo "Repo Path: $REPO_PATH"
echo "Test Command: $TEST_CMD"

echo "export SWE_BENCH_DIR=\"$SWE_BENCH_DIR\"" >> ~/.bashrc
echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc

if [[ "$REPO_PATH" == "None" ]]; then
  echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
  exit 1
fi

# Activate instance-specific environment
. $SWEUTIL_DIR/miniforge3/etc/profile.d/conda.sh
conda activate $CONDA_ENV_NAME

set +e
327
evaluation/benchmarks/testgeneval/test_filter.py
Normal file
@@ -0,0 +1,327 @@
import ast
import re
from typing import List, Tuple

from evaluation.benchmarks.testgeneval.constants import TestStatus
from evaluation.benchmarks.testgeneval.log_parsers import (
    MAP_REPO_TO_PARSER,
    parse_log_pytest,
)


def indent_text(text, indent_level):
    return '\n'.join(
        ' ' * indent_level + line if line.strip() else line for line in text.split('\n')
    )


def extract_preamble_classes_and_functions(code):
    class_pattern = re.compile(
        r'(?P<decorators>(?:^@[^\r\n]*(?:\r?\n(?:[ \t]+[^\r\n]*|^\)[^\r\n]*)*)*\r?\n)*?)'
        r'^class\s+([\w]+)(?:\([^)]*\))?:',  # the class line
        re.MULTILINE,
    )
    # Capture methods with or without decorators
    method_pattern = re.compile(r'(^(\s*@.*\s*)*^\s*def\s+[\w_]+\(.*\):)', re.MULTILINE)

    # Capture functions with or without decorators
    function_pattern = re.compile(
        r'(?P<decorators>(?:^@[^\r\n]*(?:\r?\n(?:[ \t]+[^\r\n]*|^\)[^\r\n]*)*)*\r?\n)*?)'
        r'^def\s+([\w_]+)\(.*\):',  # the function line
        re.MULTILINE,
    )

    preamble = ''
    classes = []
    test_functions = []

    current_position = 0

    def extract_class_body(code: str, start_index: int) -> Tuple[str, int]:
        """
        Extracts the body of a class from the given code starting from the specified index.
        Returns the class body and the end index of the class body.
        """
        if not code or start_index < 0 or start_index >= len(code):
            raise ValueError('Invalid code or start index')

        # Split the code into lines
        lines = code[start_index:].split('\n')
        class_body_lines = []

        # Find the starting indentation level of the class definition
        class_start_line = lines[0]
        start_indent = len(class_start_line) - len(class_start_line.lstrip())

        inside_multiline_comment = False
        end_index = start_index
        for i, line in enumerate(lines[1:], start=1):
            stripped_line = line.strip()
            current_indent = len(line) - len(line.lstrip())

            # Handle multiline comments or docstrings
            if stripped_line.startswith('"""') or stripped_line.startswith("'''"):
                if inside_multiline_comment:
                    inside_multiline_comment = False
                else:
                    inside_multiline_comment = True

            if not inside_multiline_comment:
                # Stop when we reach a line with less indentation than the class definition
                if current_indent <= start_indent and stripped_line:
                    break

            # Add lines that are part of the class body
            class_body_lines.append(line)
            # Update the end index to the current line end
            end_index = start_index + len('\n'.join(lines[: i + 1])) + 1

        return code[start_index:end_index], end_index

    while current_position < len(code):
        class_match = class_pattern.search(code, current_position)
        method_match = method_pattern.search(code, current_position)

        if class_match and (not method_match or class_match.start() < method_match.start()):
            class_name = class_match.group(0)
            class_body, end_idx = extract_class_body(code, class_match.end())
            current_position = end_idx

            methods = []
            class_prefix = class_name
            set_prefix = False
            for method_match in method_pattern.finditer(class_body):
                method_name = method_match.group()
                method_start = method_match.start()
                if not set_prefix:
                    class_prefix = class_name + class_body[:method_start]
                    set_prefix = True
                next_method = method_pattern.search(class_body, method_start + len(method_name))
                method_body = (
                    class_body[method_start : next_method.start()]
                    if next_method
                    else class_body[method_start:]
                )
                methods.append((method_name, method_body))

            classes.append((class_prefix, methods, class_match.start()))

        elif method_match:
            function_name = method_match.group(0)
            start_idx = method_match.start()

            # Extract the current function's indentation level
            lines = code[start_idx:].split('\n')
            current_indent = len(lines[0]) - len(lines[0].lstrip())

            next_function = function_pattern.search(code, start_idx + len(function_name))
            while next_function and (
                class_match is None or next_function.start() < class_match.start()
            ):
                # Calculate the indentation of the next function
                next_function_start = next_function.start()
                next_line = code[next_function_start:].split('\n', 1)[0]
                next_indent = len(next_line) - len(next_line.lstrip())

                # Check if the next function is top-level
                if next_indent <= current_indent:
                    break

                # Continue searching for the next top-level function
                next_function = function_pattern.search(
                    code, next_function.start() + len(next_function.group(0))
                )

            if next_function:
                next_function_start = next_function.start()
                if class_match and next_function_start > class_match.start():
                    next_function_start = class_match.start()
                function_body = code[start_idx:next_function_start]
            else:
                function_body = code[start_idx:]

            test_functions.append((function_body, start_idx))
            current_position = start_idx + len(function_body)

        else:
            break

    if classes and test_functions:
        preamble = code[: min(classes[0][2], test_functions[0][1])]
    else:
        preamble = (
            code[: classes[0][2]]
            if classes
            else code[: test_functions[0][1]]
            if test_functions
            else code
        )

    return preamble.strip(), classes, test_functions


def filter_passing_tests(
    test_content: str, test_output: str, repo: str
) -> Tuple[str, List[str], List[str]]:
    """
    Filter tests based on their execution results.
    Returns:
        Tuple containing:
        - Modified test content with only passing tests
        - List of passing test names
        - List of failing test names
    """
    # Parse test results using appropriate parser
    parser = MAP_REPO_TO_PARSER.get(repo, parse_log_pytest)
    test_results = parser(test_output)
    # Get passing and failing tests
    passing_tests = []
    failing_tests = []
    for test_name, status in test_results.items():
        if status == TestStatus.PASSED.value:
            passing_tests.append(test_name)
        else:
            failing_tests.append(test_name)

    if not passing_tests:
        return '', passing_tests, failing_tests

    # Extract test components
    preamble, classes, functions = extract_preamble_classes_and_functions(test_content)

    # Filter classes to only include passing methods
    filtered_classes = []
    for class_name, methods, start_idx in classes:
        non_fail_methods = []
        for method_name, method_body in methods:
            # Extract the base method name for matching
            method_full_name = method_name.split('.')[-1].split('(')[0].strip().split(' ')[-1]
            # Check if the method name is in failing_tests or if any failing_test is in the method name
            if not (
                any(method_full_name in failing_test for failing_test in failing_tests)
                or any(failing_test in method_full_name for failing_test in failing_tests)
            ):
                non_fail_methods.append((method_name, method_body))

        if non_fail_methods:
            filtered_classes.append((class_name, non_fail_methods, start_idx))

    # Filter standalone functions
    filtered_functions = []
    for func_body, start_idx in functions:
        func_name = func_body.split('def ')[1].split('(')[0].strip()
        if any(func_name in failing_test for failing_test in failing_tests) or any(
            failing_test in func_name for failing_test in failing_tests
        ):
            continue

        filtered_functions.append((func_body, start_idx))

    # Reconstruct test content with only passing tests
    content_parts = [preamble]

    # Add filtered classes
    for class_name, methods, _ in filtered_classes:
        class_content = class_name + '\n'
        for _, method_body in methods:
            class_content += method_body + '\n'
        content_parts.append(class_content)

    # Add filtered functions
    for func_body, _ in filtered_functions:
        content_parts.append(func_body)

    return '\n\n'.join(content_parts), passing_tests, failing_tests


def filter_tests(
    test_content: str, test_output: str, repo: str
) -> Tuple[str, List[str], List[str]]:
    """
    Filter tests using AST parsing to remove failing test functions from the test file.
    Non-test functions (e.g. setup or helper methods) and classes (even if all test methods are failing)
    are preserved.

    If AST processing fails (for example, because the test file cannot be parsed),
    this function falls back on the existing regex-based filtering (filter_passing_tests).

    Returns:
        Tuple containing:
        - Modified test content (as a string) containing only passing tests.
        - List of passing test names.
        - List of failing test names.
    """
    try:
        # Attempt to parse the test file using the AST.
        tree = ast.parse(test_content)

        # Parse test results using the appropriate parser.
        parser = MAP_REPO_TO_PARSER.get(repo, parse_log_pytest)
        test_results = parser(test_output)
        passing_tests = [
            name for name, status in test_results.items() if status == TestStatus.PASSED.value
        ]
        failing_tests = [
            name for name, status in test_results.items() if status != TestStatus.PASSED.value
        ]

        # Helper function to decide if a test name should be considered failing.
        def is_failing(name: str) -> bool:
            for ft in failing_tests:
                if name in ft or ft in name:
                    return True
            return False

        new_body = []
        for node in tree.body:
            # For top-level function definitions, only filter those that look like tests.
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                if node.name.startswith('test') and is_failing(node.name):
                    continue
                new_body.append(node)
            # For classes, filter out failing test methods but preserve other methods (e.g. setup).
            elif isinstance(node, ast.ClassDef):
                new_class_body = []
                for subnode in node.body:
                    if isinstance(subnode, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        # Only consider filtering if the method is a test.
                        qualified_name = f'{node.name}.{subnode.name}'
                        if is_failing(subnode.name) or is_failing(qualified_name):
                            continue
                        new_class_body.append(subnode)
                    else:
                        new_class_body.append(subnode)
                # Always include the class even if no test methods remain, as it might contain
                # setup, teardown, or other necessary logic.
                if new_class_body:
                    node.body = new_class_body
                new_body.append(node)

            else:
                new_body.append(node)

        tree.body = new_body

        # Reconstruct the source code from the filtered AST.
        # (Requires Python 3.9+ for ast.unparse; otherwise an exception will trigger the fallback.)
        new_test_content = ast.unparse(tree)
        return new_test_content, passing_tests, failing_tests

    except Exception:
        print('AST processing failed; falling back on regex-based filtering.')
        # If AST processing fails for any reason, fall back on the original regex-based filtering.
        return filter_passing_tests(test_content, test_output, repo)
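filter_tests walks the module AST, drops test functions (and failing test methods inside classes) whose names match a failing test, and unparses what remains. A minimal, self-contained sketch of that idea, with a hard-coded failing-test set standing in for the benchmark's log parsers:

import ast

# Toy test module; in the benchmark this is the generated test file.
SOURCE = '''
def helper():
    return 41 + 1

def test_ok():
    assert helper() == 42

def test_broken():
    assert helper() == 0
'''

failing = {'test_broken'}  # normally derived from parsed test output

tree = ast.parse(SOURCE)
tree.body = [
    node
    for node in tree.body
    if not (
        isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        and node.name.startswith('test')
        and node.name in failing
    )
]
# Requires Python 3.9+ for ast.unparse, as noted above.
print(ast.unparse(tree))  # helper() and test_ok() remain; test_broken() is gone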
166
evaluation/benchmarks/testgeneval/test_spec.py
Normal file
@@ -0,0 +1,166 @@
from __future__ import annotations

from dataclasses import dataclass

from evaluation.benchmarks.testgeneval.constants import (
    COVERAGE_PREFIX,
    KEY_INSTANCE_ID,
    MAP_REPO_VERSION_TO_SPECS,
    TESTS_FAILED,
    TESTS_SUFFIX,
    UPDATE_TOX,
    TestGenEvalInstance,
)
from evaluation.benchmarks.testgeneval.utils import (
    get_test_directives,
)

DIFF_MODIFIED_FILE_REGEX = r'--- a/(.*)'


@dataclass
class TestSpec:
    """
    A dataclass that represents a test specification for a single instance of SWE-bench.
    """

    instance_id: str
    id: str
    repo: str
    version: str
    test_cmd: str
    code_file: str
    test_file: str
    baseline_covs: dict
    local_imports: list[str]
    test_script_list: list[str]
    mutation_script_list: list[str]

    @property
    def test_script(self):
        return (
            '\n'.join(['#!/bin/bash', 'set -uo pipefail'] + self.test_script_list)
            + '\n'
        )
        # Don't exit early because we need to revert tests at the end

    @property
    def mutation_script(self):
        return (
            '\n'.join(['#!/bin/bash', 'set -uo pipefail'] + self.mutation_script_list)
            + '\n'
        )
        # Don't exit early because we need to revert tests at the end


def make_test_setup(specs, env_name, repo_directory, includes_tox=False):
    eval_commands = []

    if includes_tox:
        eval_commands.append(UPDATE_TOX)

    eval_commands += [
        'source /opt/miniconda3/bin/activate',
        f'conda activate {env_name}',
        f'cd {repo_directory}',
    ]
    if 'eval_commands' in specs:
        eval_commands += specs['eval_commands']
    eval_commands += [
        f'git config --global --add safe.directory {repo_directory}',  # for nonroot user
        f'cd {repo_directory}',
        # This is just informational, so we have a record
        'git status',
        'git show',
        'source /opt/miniconda3/bin/activate',
        f'conda activate {env_name}',
    ]
    if 'install' in specs:
        eval_commands.append(specs['install'])

    if includes_tox:
        eval_commands.append('add_coverage_tox "tox.ini"')

    eval_commands.append('[ -f ".coveragerc" ] && rm ".coveragerc"')
    return eval_commands


def make_test_script_list(test_cmd, specs, env_name, repo_directory):
    """
    Runs the tests.
    """

    includes_tox = 'tox' in test_cmd
    eval_commands = make_test_setup(specs, env_name, repo_directory, includes_tox)
    eval_commands += [
        f'{test_cmd} || {{ echo "{TESTS_FAILED}\n{TESTS_SUFFIX}\n" && exit 1; }}',
        f'echo "{TESTS_SUFFIX}"\n',
        'coverage json -o coverage.json',
        f'echo "{COVERAGE_PREFIX}"\n',
        'cat coverage.json',
    ]

    return eval_commands


def make_mutation_script_list(specs, env_name, repo_directory, mutation_timeout):
    """
    Runs the mutation testing.
    """

    eval_commands = make_test_setup(specs, env_name, repo_directory)
    eval_commands += [
        'cosmic-ray init mutation.toml mutation.sqlite',
        f'timeout {mutation_timeout}s cosmic-ray exec mutation.toml mutation.sqlite',
        'cr-report mutation.sqlite',
        'cr-rate mutation.sqlite --estimate --confidence 95.0',
    ]
    return eval_commands


def make_test_spec(
    instance: TestGenEvalInstance, mutation_timeout: int, buffer: int
) -> TestSpec:
    if isinstance(instance, TestSpec):
        return instance
    instance_id = instance[KEY_INSTANCE_ID]
    id = instance['id']
    repo = instance['repo']
    version = instance['version']
    baseline_covs = instance['baseline_covs']
    code_file = instance['code_file']
    test_file = instance['test_file']
    local_imports = instance['local_imports']

    env_name = 'testbed'
    repo_directory = f'/{env_name}'
    specs = MAP_REPO_VERSION_TO_SPECS[repo][version]

    test_cmd = ' '.join(
        [
            MAP_REPO_VERSION_TO_SPECS[instance['repo']][instance['version']]['test_cmd'],
            *get_test_directives(instance),
        ]
    )

    test_script_list = make_test_script_list(test_cmd, specs, env_name, repo_directory)

    mutation_script_list = make_mutation_script_list(
        specs, env_name, repo_directory, mutation_timeout - buffer
    )

    return TestSpec(
        instance_id=instance_id,
        id=id,
        repo=repo,
        test_script_list=test_script_list,
        test_cmd=test_cmd,
        local_imports=local_imports,
        mutation_script_list=mutation_script_list,
        code_file=code_file,
        test_file=test_file,
        baseline_covs=baseline_covs,
        version=version,
    )
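The test_script and mutation_script properties just prepend a bash header to the command lists built by make_test_setup and friends. A toy illustration with placeholder commands (the real list comes from MAP_REPO_VERSION_TO_SPECS for the instance's repo and version):

# Placeholder command list; the actual one is assembled per instance by
# make_test_setup / make_test_script_list.
test_script_list = [
    'source /opt/miniconda3/bin/activate',
    'conda activate testbed',
    'cd /testbed',
    'coverage run -m pytest /testbed/tests/test_example.py',  # hypothetical test command
]
test_script = '\n'.join(['#!/bin/bash', 'set -uo pipefail'] + test_script_list) + '\n'
print(test_script)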
73
evaluation/benchmarks/testgeneval/utils.py
Normal file
@@ -0,0 +1,73 @@
import json
from pathlib import Path
from typing import cast

from datasets import Dataset, load_dataset

from evaluation.benchmarks.testgeneval.constants import (
    KEY_INSTANCE_ID,
    TestGenEvalInstance,
)


def get_test_directives(instance: TestGenEvalInstance) -> list:
    """
    Get test directives from the test_patch of a task instance

    Args:
        instance (dict): task instance
    Returns:
        directives (list): List of test directives
    """
    # For seq2seq code repos, testing command is fixed
    if instance['repo'] == 'swe-bench/humaneval':
        return ['test.py']

    # Get test directives from test patch and remove non-test files
    directives = [f"/testbed/{instance['test_file']}"]

    # For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing)
    if instance['repo'] == 'django/django':
        directives = [instance['test_file']]
        directives_transformed = []
        for d in directives:
            d = d[: -len('.py')] if d.endswith('.py') else d
            d = d[len('tests/') :] if d.startswith('tests/') else d
            d = d.replace('/', '.')
            directives_transformed.append(d)
        directives = directives_transformed

    return directives


def load_testgeneval_dataset(
    name='kjain14/testgeneval', split='test', ids=None
) -> list[TestGenEvalInstance]:
    """
    Load TestGenEval dataset from Hugging Face Datasets or local .json/.jsonl file
    """
    # check that all instance IDs are in the dataset
    if ids:
        ids = set(ids)
    # Load from local .json/.jsonl file
    if name.endswith('.json') or name.endswith('.jsonl'):
        dataset = json.loads(Path(name).read_text())
        dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}
    else:
        # Load from Hugging Face Datasets
        if name.lower() in {'testgeneval'}:
            name = 'kjain14/testgeneval'
        elif name.lower() in {'testgeneval-lite', 'testgenevallite', 'lite'}:
            name = 'kjain14/testgenevallite'
        dataset = cast(Dataset, load_dataset(name, split=split))
        dataset_ids = {instance['id'] for instance in dataset}
    if ids:
        if ids - dataset_ids:
            raise ValueError(
                (
                    "Some instance IDs not found in dataset!"
                    f"\nMissing IDs:\n{' '.join(ids - dataset_ids)}"
                )
            )
        dataset = [instance for instance in dataset if instance['id'] in ids]
    return [cast(TestGenEvalInstance, instance) for instance in dataset]
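For django/django instances, get_test_directives converts the test file path into a dotted test label: strip the '.py' extension, strip a leading 'tests/' prefix, and replace slashes with dots. A quick check of that transformation on a hypothetical path:

# Hypothetical Django test file path, transformed the same way as in
# get_test_directives for 'django/django' instances.
test_file = 'tests/auth_tests/test_validators.py'

d = test_file
d = d[: -len('.py')] if d.endswith('.py') else d
d = d[len('tests/') :] if d.startswith('tests/') else d
d = d.replace('/', '.')

print(d)  # auth_tests.test_validators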
259
poetry.lock
generated
@@ -2008,6 +2008,21 @@ files = [
{file = "func_timeout-4.3.5.tar.gz", hash = "sha256:74cd3c428ec94f4edfba81f9b2f14904846d5ffccc27c92433b8b5939b5575dd"},
]

[[package]]
name = "fuzzywuzzy"
version = "0.18.0"
description = "Fuzzy string matching in python"
optional = false
python-versions = "*"
groups = ["testgeneval"]
files = [
{file = "fuzzywuzzy-0.18.0-py2.py3-none-any.whl", hash = "sha256:928244b28db720d1e0ee7587acf660ea49d7e4c632569cad4f1cd7e68a5f0993"},
{file = "fuzzywuzzy-0.18.0.tar.gz", hash = "sha256:45016e92264780e58972dca1b3d939ac864b78437422beecebb3095f8efd00e8"},
]

[package.extras]
speedup = ["python-levenshtein (>=0.12)"]

[[package]]
name = "gdown"
version = "5.2.0"
@@ -3739,6 +3754,107 @@ dev = ["changelist (==0.5)"]
lint = ["pre-commit (==3.7.0)"]
test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"]

[[package]]
name = "levenshtein"
version = "0.26.1"
description = "Python extension for computing string edit distances and similarities."
optional = false
python-versions = ">=3.9"
groups = ["testgeneval"]
files = [
{file = "levenshtein-0.26.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8dc4a4aecad538d944a1264c12769c99e3c0bf8e741fc5e454cc954913befb2e"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ec108f368c12b25787c8b1a4537a1452bc53861c3ee4abc810cc74098278edcd"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69229d651c97ed5b55b7ce92481ed00635cdbb80fbfb282a22636e6945dc52d5"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79dcd157046d62482a7719b08ba9e3ce9ed3fc5b015af8ea989c734c702aedd4"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f53f9173ae21b650b4ed8aef1d0ad0c37821f367c221a982f4d2922b3044e0d"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3956f3c5c229257dbeabe0b6aacd2c083ebcc1e335842a6ff2217fe6cc03b6b"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1e83af732726987d2c4cd736f415dae8b966ba17b7a2239c8b7ffe70bfb5543"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4f052c55046c2a9c9b5f742f39e02fa6e8db8039048b8c1c9e9fdd27c8a240a1"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9895b3a98f6709e293615fde0dcd1bb0982364278fa2072361a1a31b3e388b7a"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a3777de1d8bfca054465229beed23994f926311ce666f5a392c8859bb2722f16"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:81c57e1135c38c5e6e3675b5e2077d8a8d3be32bf0a46c57276c092b1dffc697"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:91d5e7d984891df3eff7ea9fec8cf06fdfacc03cd074fd1a410435706f73b079"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-win32.whl", hash = "sha256:f48abff54054b4142ad03b323e80aa89b1d15cabc48ff49eb7a6ff7621829a56"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-win_amd64.whl", hash = "sha256:79dd6ad799784ea7b23edd56e3bf94b3ca866c4c6dee845658ee75bb4aefdabf"},
|
||||
{file = "levenshtein-0.26.1-cp310-cp310-win_arm64.whl", hash = "sha256:3351ddb105ef010cc2ce474894c5d213c83dddb7abb96400beaa4926b0b745bd"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:44c51f5d33b3cfb9db518b36f1288437a509edd82da94c4400f6a681758e0cb6"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:56b93203e725f9df660e2afe3d26ba07d71871b6d6e05b8b767e688e23dfb076"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:270d36c5da04a0d89990660aea8542227cbd8f5bc34e9fdfadd34916ff904520"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:480674c05077eeb0b0f748546d4fcbb386d7c737f9fff0010400da3e8b552942"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13946e37323728695ba7a22f3345c2e907d23f4600bc700bf9b4352fb0c72a48"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ceb673f572d1d0dc9b1cd75792bb8bad2ae8eb78a7c6721e23a3867d318cb6f2"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42d6fa242e3b310ce6bfd5af0c83e65ef10b608b885b3bb69863c01fb2fcff98"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b8b68295808893a81e0a1dbc2274c30dd90880f14d23078e8eb4325ee615fc68"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:b01061d377d1944eb67bc40bef5d4d2f762c6ab01598efd9297ce5d0047eb1b5"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9d12c8390f156745e533d01b30773b9753e41d8bbf8bf9dac4b97628cdf16314"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:48825c9f967f922061329d1481b70e9fee937fc68322d6979bc623f69f75bc91"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d8ec137170b95736842f99c0e7a9fd8f5641d0c1b63b08ce027198545d983e2b"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-win32.whl", hash = "sha256:798f2b525a2e90562f1ba9da21010dde0d73730e277acaa5c52d2a6364fd3e2a"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-win_amd64.whl", hash = "sha256:55b1024516c59df55f1cf1a8651659a568f2c5929d863d3da1ce8893753153bd"},
|
||||
{file = "levenshtein-0.26.1-cp311-cp311-win_arm64.whl", hash = "sha256:e52575cbc6b9764ea138a6f82d73d3b1bc685fe62e207ff46a963d4c773799f6"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cc741ca406d3704dc331a69c04b061fc952509a069b79cab8287413f434684bd"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:821ace3b4e1c2e02b43cf5dc61aac2ea43bdb39837ac890919c225a2c3f2fea4"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92694c9396f55d4c91087efacf81297bef152893806fc54c289fc0254b45384"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:51ba374de7a1797d04a14a4f0ad3602d2d71fef4206bb20a6baaa6b6a502da58"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f7aa5c3327dda4ef952769bacec09c09ff5bf426e07fdc94478c37955681885b"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33e2517e8d3c221de2d1183f400aed64211fcfc77077b291ed9f3bb64f141cdc"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9092b622765c7649dd1d8af0f43354723dd6f4e570ac079ffd90b41033957438"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fc16796c85d7d8b259881d59cc8b5e22e940901928c2ff6924b2c967924e8a0b"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4370733967f5994ceeed8dc211089bedd45832ee688cecea17bfd35a9eb22b9"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3535ecfd88c9b283976b5bc61265855f59bba361881e92ed2b5367b6990c93fe"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:90236e93d98bdfd708883a6767826fafd976dac8af8fc4a0fb423d4fa08e1bf0"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:04b7cabb82edf566b1579b3ed60aac0eec116655af75a3c551fee8754ffce2ea"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-win32.whl", hash = "sha256:ae382af8c76f6d2a040c0d9ca978baf461702ceb3f79a0a3f6da8d596a484c5b"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-win_amd64.whl", hash = "sha256:fd091209798cfdce53746f5769987b4108fe941c54fb2e058c016ffc47872918"},
|
||||
{file = "levenshtein-0.26.1-cp312-cp312-win_arm64.whl", hash = "sha256:7e82f2ea44a81ad6b30d92a110e04cd3c8c7c6034b629aca30a3067fa174ae89"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:790374a9f5d2cbdb30ee780403a62e59bef51453ac020668c1564d1e43438f0e"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7b05c0415c386d00efda83d48db9db68edd02878d6dbc6df01194f12062be1bb"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3114586032361722ddededf28401ce5baf1cf617f9f49fb86b8766a45a423ff"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2532f8a13b68bf09f152d906f118a88da2063da22f44c90e904b142b0a53d534"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:219c30be6aa734bf927188d1208b7d78d202a3eb017b1c5f01ab2034d2d4ccca"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:397e245e77f87836308bd56305bba630010cd8298c34c4c44bd94990cdb3b7b1"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aeff6ea3576f72e26901544c6c55c72a7b79b9983b6f913cba0e9edbf2f87a97"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a19862e3539a697df722a08793994e334cd12791e8144851e8a1dee95a17ff63"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:dc3b5a64f57c3c078d58b1e447f7d68cad7ae1b23abe689215d03fc434f8f176"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bb6c7347424a91317c5e1b68041677e4c8ed3e7823b5bbaedb95bffb3c3497ea"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b817376de4195a207cc0e4ca37754c0e1e1078c2a2d35a6ae502afde87212f9e"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7b50c3620ff47c9887debbb4c154aaaac3e46be7fc2e5789ee8dbe128bce6a17"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-win32.whl", hash = "sha256:9fb859da90262eb474c190b3ca1e61dee83add022c676520f5c05fdd60df902a"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-win_amd64.whl", hash = "sha256:8adcc90e3a5bfb0a463581d85e599d950fe3c2938ac6247b29388b64997f6e2d"},
|
||||
{file = "levenshtein-0.26.1-cp313-cp313-win_arm64.whl", hash = "sha256:c2599407e029865dc66d210b8804c7768cbdbf60f061d993bb488d5242b0b73e"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:dc54ced948fc3feafce8ad4ba4239d8ffc733a0d70e40c0363ac2a7ab2b7251e"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e6516f69213ae393a220e904332f1a6bfc299ba22cf27a6520a1663a08eba0fb"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4cfea4eada1746d0c75a864bc7e9e63d4a6e987c852d6cec8d9cb0c83afe25b"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a323161dfeeac6800eb13cfe76a8194aec589cd948bcf1cdc03f66cc3ec26b72"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2c23e749b68ebc9a20b9047317b5cd2053b5856315bc8636037a8adcbb98bed1"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f80dd7432d4b6cf493d012d22148db7af769017deb31273e43406b1fb7f091c"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0ae7cd6e4312c6ef34b2e273836d18f9fff518d84d823feff5ad7c49668256e0"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dcdad740e841d791b805421c2b20e859b4ed556396d3063b3aa64cd055be648c"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e07afb1613d6f5fd99abd4e53ad3b446b4efaa0f0d8e9dfb1d6d1b9f3f884d32"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:f1add8f1d83099a98ae4ac472d896b7e36db48c39d3db25adf12b373823cdeff"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:1010814b1d7a60833a951f2756dfc5c10b61d09976ce96a0edae8fecdfb0ea7c"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:33fa329d1bb65ce85e83ceda281aea31cee9f2f6e167092cea54f922080bcc66"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-win32.whl", hash = "sha256:488a945312f2f16460ab61df5b4beb1ea2254c521668fd142ce6298006296c98"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-win_amd64.whl", hash = "sha256:9f942104adfddd4b336c3997050121328c39479f69de702d7d144abb69ea7ab9"},
|
||||
{file = "levenshtein-0.26.1-cp39-cp39-win_arm64.whl", hash = "sha256:c1d8f85b2672939f85086ed75effcf768f6077516a3e299c2ba1f91bc4644c22"},
|
||||
{file = "levenshtein-0.26.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6cf8f1efaf90ca585640c5d418c30b7d66d9ac215cee114593957161f63acde0"},
|
||||
{file = "levenshtein-0.26.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d5b2953978b8c158dd5cd93af8216a5cfddbf9de66cf5481c2955f44bb20767a"},
|
||||
{file = "levenshtein-0.26.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b952b3732c4631c49917d4b15d78cb4a2aa006c1d5c12e2a23ba8e18a307a055"},
|
||||
{file = "levenshtein-0.26.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07227281e12071168e6ae59238918a56d2a0682e529f747b5431664f302c0b42"},
|
||||
{file = "levenshtein-0.26.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8191241cd8934feaf4d05d0cc0e5e72877cbb17c53bbf8c92af9f1aedaa247e9"},
|
||||
{file = "levenshtein-0.26.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9e70d7ee157a9b698c73014f6e2b160830e7d2d64d2e342fefc3079af3c356fc"},
|
||||
{file = "levenshtein-0.26.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0eb3059f826f6cb0a5bca4a85928070f01e8202e7ccafcba94453470f83e49d4"},
|
||||
{file = "levenshtein-0.26.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:6c389e44da12d6fb1d7ba0a709a32a96c9391e9be4160ccb9269f37e040599ee"},
|
||||
{file = "levenshtein-0.26.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e9de292f2c51a7d34a0ae23bec05391b8f61f35781cd3e4c6d0533e06250c55"},
|
||||
{file = "levenshtein-0.26.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d87215113259efdca8716e53b6d59ab6d6009e119d95d45eccc083148855f33"},
|
||||
{file = "levenshtein-0.26.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18f00a3eebf68a82fb651d8d0e810c10bfaa60c555d21dde3ff81350c74fb4c2"},
|
||||
{file = "levenshtein-0.26.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b3554c1b59de63d05075577380340c185ff41b028e541c0888fddab3c259a2b4"},
|
||||
{file = "levenshtein-0.26.1.tar.gz", hash = "sha256:0d19ba22330d50609b2349021ec3cf7d905c6fe21195a2d0d876a146e7ed2575"},
]

[package.dependencies]
rapidfuzz = ">=3.9.0,<4.0.0"

[[package]]
name = "libtmux"
version = "0.39.0"
@@ -6217,6 +6333,21 @@ files = [
[package.extras]
dev = ["backports.zoneinfo", "black", "build", "freezegun", "mdx_truly_sane_lists", "mike", "mkdocs", "mkdocs-awesome-pages-plugin", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-material (>=8.5)", "mkdocstrings[python]", "msgspec", "mypy", "orjson", "pylint", "pytest", "tzdata", "validate-pyproject[all]"]

[[package]]
name = "python-levenshtein"
version = "0.26.1"
description = "Python extension for computing string edit distances and similarities."
optional = false
python-versions = ">=3.9"
groups = ["testgeneval"]
files = [
{file = "python_Levenshtein-0.26.1-py3-none-any.whl", hash = "sha256:8ef5e529dd640fb00f05ee62d998d2ee862f19566b641ace775d5ae16167b2ef"},
{file = "python_levenshtein-0.26.1.tar.gz", hash = "sha256:24ba578e28058ebb4afa2700057e1678d7adf27e43cd1f17700c09a9009d5d3a"},
]

[package.dependencies]
Levenshtein = "0.26.1"

[[package]]
name = "python-multipart"
version = "0.0.20"
@@ -6555,6 +6686,113 @@ packaging = "*"
[package.extras]
test = ["pytest (>=6,!=7.0.0,!=7.0.1)", "pytest-cov (>=3.0.0)", "pytest-qt"]

[[package]]
name = "rapidfuzz"
version = "3.12.2"
description = "rapid fuzzy string matching"
optional = false
python-versions = ">=3.9"
groups = ["testgeneval"]
files = [
{file = "rapidfuzz-3.12.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0b9a75e0385a861178adf59e86d6616cbd0d5adca7228dc9eeabf6f62cf5b0b1"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6906a7eb458731e3dd2495af1d0410e23a21a2a2b7ced535e6d5cd15cb69afc5"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4b3334a8958b689f292d5ce8a928140ac98919b51e084f04bf0c14276e4c6ba"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:85a54ce30345cff2c79cbcffa063f270ad1daedd0d0c3ff6e541d3c3ba4288cf"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:acb63c5072c08058f8995404201a52fc4e1ecac105548a4d03c6c6934bda45a3"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5385398d390c6571f0f2a7837e6ddde0c8b912dac096dc8c87208ce9aaaa7570"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5032cbffa245b4beba0067f8ed17392ef2501b346ae3c1f1d14b950edf4b6115"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:195adbb384d89d6c55e2fd71e7fb262010f3196e459aa2f3f45f31dd7185fe72"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f43b773a4d4950606fb25568ecde5f25280daf8f97b87eb323e16ecd8177b328"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:55a43be0e0fa956a919043c19d19bd988991d15c59f179d413fe5145ed9deb43"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:71cf1ea16acdebe9e2fb62ee7a77f8f70e877bebcbb33b34e660af2eb6d341d9"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a3692d4ab36d44685f61326dca539975a4eda49b2a76f0a3df177d8a2c0de9d2"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-win32.whl", hash = "sha256:09227bd402caa4397ba1d6e239deea635703b042dd266a4092548661fb22b9c6"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-win_amd64.whl", hash = "sha256:0f05b7b95f9f87254b53fa92048367a8232c26cee7fc8665e4337268c3919def"},
|
||||
{file = "rapidfuzz-3.12.2-cp310-cp310-win_arm64.whl", hash = "sha256:6938738e00d9eb6e04097b3f565097e20b0c398f9c58959a2bc64f7f6be3d9da"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e9c4d984621ae17404c58f8d06ed8b025e167e52c0e6a511dfec83c37e9220cd"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9f9132c55d330f0a1d34ce6730a76805323a6250d97468a1ca766a883d6a9a25"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39b343b6cb4b2c3dbc8d2d4c5ee915b6088e3b144ddf8305a57eaab16cf9fc74"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24081077b571ec4ee6d5d7ea0e49bc6830bf05b50c1005028523b9cd356209f3"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c988a4fc91856260355773bf9d32bebab2083d4c6df33fafeddf4330e5ae9139"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:780b4469ee21cf62b1b2e8ada042941fd2525e45d5fb6a6901a9798a0e41153c"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:edd84b0a323885493c893bad16098c5e3b3005d7caa995ae653da07373665d97"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efa22059c765b3d8778083805b199deaaf643db070f65426f87d274565ddf36a"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:095776b11bb45daf7c2973dd61cc472d7ea7f2eecfa454aef940b4675659b92f"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7e2574cf4aa86065600b664a1ac7b8b8499107d102ecde836aaaa403fc4f1784"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:d5a3425a6c50fd8fbd991d8f085ddb504791dae6ef9cc3ab299fea2cb5374bef"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fb05e1ddb7b71a054040af588b0634214ee87cea87900d309fafc16fd272a4"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-win32.whl", hash = "sha256:b4c5a0413589aef936892fbfa94b7ff6f7dd09edf19b5a7b83896cc9d4e8c184"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-win_amd64.whl", hash = "sha256:58d9ae5cf9246d102db2a2558b67fe7e73c533e5d769099747921232d88b9be2"},
|
||||
{file = "rapidfuzz-3.12.2-cp311-cp311-win_arm64.whl", hash = "sha256:7635fe34246cd241c8e35eb83084e978b01b83d5ef7e5bf72a704c637f270017"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1d982a651253ffe8434d9934ff0c1089111d60502228464721a2a4587435e159"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:02e6466caa0222d5233b1f05640873671cd99549a5c5ba4c29151634a1e56080"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e956b3f053e474abae69ac693a52742109d860ac2375fe88e9387d3277f4c96c"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2dee7d740a2d5418d4f964f39ab8d89923e6b945850db833e798a1969b19542a"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a057cdb0401e42c84b6516c9b1635f7aedd5e430c6e388bd5f6bcd1d6a0686bb"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dccf8d4fb5b86d39c581a59463c596b1d09df976da26ff04ae219604223d502f"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21d5b3793c6f5aecca595cd24164bf9d3c559e315ec684f912146fc4e769e367"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:46a616c0e13cff2de1761b011e0b14bb73b110182f009223f1453d505c9a975c"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19fa5bc4301a1ee55400d4a38a8ecf9522b0391fc31e6da5f4d68513fe5c0026"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:544a47190a0d25971658a9365dba7095397b4ce3e897f7dd0a77ca2cf6fa984e"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f21af27c5e001f0ba1b88c36a0936437dfe034c452548d998891c21125eb640f"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b63170d9db00629b5b3f2862114d8d6ee19127eaba0eee43762d62a25817dbe0"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-win32.whl", hash = "sha256:6c7152d77b2eb6bfac7baa11f2a9c45fd5a2d848dbb310acd0953b3b789d95c9"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-win_amd64.whl", hash = "sha256:1a314d170ee272ac87579f25a6cf8d16a031e1f7a7b07663434b41a1473bc501"},
|
||||
{file = "rapidfuzz-3.12.2-cp312-cp312-win_arm64.whl", hash = "sha256:d41e8231326e94fd07c4d8f424f6bed08fead6f5e6688d1e6e787f1443ae7631"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:941f31038dba5d3dedcfcceba81d61570ad457c873a24ceb13f4f44fcb574260"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fe2dfc454ee51ba168a67b1e92b72aad251e45a074972cef13340bbad2fd9438"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78fafaf7f5a48ee35ccd7928339080a0136e27cf97396de45259eca1d331b714"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0c7989ff32c077bb8fd53253fd6ca569d1bfebc80b17557e60750e6909ba4fe"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96fa00bc105caa34b6cd93dca14a29243a3a7f0c336e4dcd36348d38511e15ac"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bccfb30c668620c5bc3490f2dc7d7da1cca0ead5a9da8b755e2e02e2ef0dff14"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f9b0adc3d894beb51f5022f64717b6114a6fabaca83d77e93ac7675911c8cc5"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:32691aa59577f42864d5535cb6225d0f47e2c7bff59cf4556e5171e96af68cc1"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:758b10380ad34c1f51753a070d7bb278001b5e6fcf544121c6df93170952d705"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:50a9c54c0147b468363119132d514c5024fbad1ed8af12bd8bd411b0119f9208"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e3ceb87c11d2d0fbe8559bb795b0c0604b84cfc8bb7b8720b5c16e9e31e00f41"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f7c9a003002434889255ff5676ca0f8934a478065ab5e702f75dc42639505bba"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-win32.whl", hash = "sha256:cf165a76870cd875567941cf861dfd361a0a6e6a56b936c5d30042ddc9def090"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-win_amd64.whl", hash = "sha256:55bcc003541f5f16ec0a73bf6de758161973f9e8d75161954380738dd147f9f2"},
|
||||
{file = "rapidfuzz-3.12.2-cp313-cp313-win_arm64.whl", hash = "sha256:69f6ecdf1452139f2b947d0c169a605de578efdb72cbb2373cb0a94edca1fd34"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c852cd8bed1516a64fd6e2d4c6f270d4356196ee03fda2af1e5a9e13c34643"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:42e7f747b55529a6d0d1588695d71025e884ab48664dca54b840413dea4588d8"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a749fd2690f24ef256b264a781487746bbb95344364fe8fe356f0eef7ef206ba"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9a11e1d036170bbafa43a9e63d8c309273564ec5bdfc5439062f439d1a16965a"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dfb337f1832c1231e3d5621bd0ebebb854e46036aedae3e6a49c1fc08f16f249"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e88c6e68fca301722fa3ab7fd3ca46998012c14ada577bc1e2c2fc04f2067ca6"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17e1a3a8b4b5125cfb63a6990459b25b87ea769bdaf90d05bb143f8febef076a"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b9f8177b24ccc0a843e85932b1088c5e467a7dd7a181c13f84c684b796bea815"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6c506bdc2f304051592c0d3b0e82eed309248ec10cdf802f13220251358375ea"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:30bf15c1ecec2798b713d551df17f23401a3e3653ad9ed4e83ad1c2b06e86100"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:bd9a67cfc83e8453ef17ddd1c2c4ce4a74d448a197764efb54c29f29fb41f611"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7a6eaec2ef658dd650c6eb9b36dff7a361ebd7d8bea990ce9d639b911673b2cb"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-win32.whl", hash = "sha256:d7701769f110332cde45c41759cb2a497de8d2dca55e4c519a46aed5fbb19d1a"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-win_amd64.whl", hash = "sha256:296bf0fd4f678488670e262c87a3e4f91900b942d73ae38caa42a417e53643b1"},
|
||||
{file = "rapidfuzz-3.12.2-cp39-cp39-win_arm64.whl", hash = "sha256:7957f5d768de14f6b2715303ccdf224b78416738ee95a028a2965c95f73afbfb"},
|
||||
{file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e5fd3ce849b27d063755829cda27a9dab6dbd63be3801f2a40c60ec563a4c90f"},
|
||||
{file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:54e53662d71ed660c83c5109127c8e30b9e607884b7c45d2aff7929bbbd00589"},
|
||||
{file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b9e43cf2213e524f3309d329f1ad8dbf658db004ed44f6ae1cd2919aa997da5"},
|
||||
{file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29ca445e320e5a8df3bd1d75b4fa4ecfa7c681942b9ac65b55168070a1a1960e"},
|
||||
{file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83eb7ef732c2f8533c6b5fbe69858a722c218acc3e1fc190ab6924a8af7e7e0e"},
|
||||
{file = "rapidfuzz-3.12.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:648adc2dd2cf873efc23befcc6e75754e204a409dfa77efd0fea30d08f22ef9d"},
|
||||
{file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:9b1e6f48e1ffa0749261ee23a1c6462bdd0be5eac83093f4711de17a42ae78ad"},
|
||||
{file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:1ae9ded463f2ca4ba1eb762913c5f14c23d2e120739a62b7f4cc102eab32dc90"},
|
||||
{file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dda45f47b559be72ecbce45c7f71dc7c97b9772630ab0f3286d97d2c3025ab71"},
|
||||
{file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3745c6443890265513a3c8777f2de4cb897aeb906a406f97741019be8ad5bcc"},
|
||||
{file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36d3ef4f047ed1bc96fa29289f9e67a637ddca5e4f4d3dc7cb7f50eb33ec1664"},
|
||||
{file = "rapidfuzz-3.12.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:54bb69ebe5ca0bd7527357e348f16a4c0c52fe0c2fcc8a041010467dcb8385f7"},
|
||||
{file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3f2ddd5b99b254039a8c82be5749d4d75943f62eb2c2918acf6ffd586852834f"},
|
||||
{file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:8117dab9b26a1aaffab59b4e30f80ac4d55e61ad4139a637c149365960933bee"},
|
||||
{file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40c0f16d62d6553527de3dab2fb69709c4383430ea44bce8fb4711ed4cbc6ae3"},
|
||||
{file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f177e1eb6e4f5261a89c475e21bce7a99064a8f217d2336fb897408f46f0ceaf"},
|
||||
{file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5df0cecc2852fcb078ed1b4482fac4fc2c2e7787f3edda8920d9a4c0f51b1c95"},
|
||||
{file = "rapidfuzz-3.12.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3b3c4df0321df6f8f0b61afbaa2ced9622750ee1e619128db57a18533d139820"},
|
||||
{file = "rapidfuzz-3.12.2.tar.gz", hash = "sha256:b0ba1ccc22fff782e7152a3d3d0caca44ec4e32dc48ba01c560b8593965b5aa3"},
]

[package.extras]
all = ["numpy"]

[[package]]
name = "redis"
version = "5.2.1"
@@ -6817,6 +7055,21 @@ pygments = ">=2.13.0,<3.0.0"
[package.extras]
jupyter = ["ipywidgets (>=7.5.1,<9)"]

[[package]]
name = "rouge"
version = "1.0.1"
description = "Full Python ROUGE Score Implementation (not a wrapper)"
optional = false
python-versions = "*"
groups = ["testgeneval"]
files = [
{file = "rouge-1.0.1-py3-none-any.whl", hash = "sha256:28d118536e8c774dc47d1d15ec266479b4dd0914c4672ce117d4002789bdc644"},
{file = "rouge-1.0.1.tar.gz", hash = "sha256:12b48346ca47d6bcf3c45061f315452b9ccec0620ee895ec85b7efc3d54aae34"},
]

[package.dependencies]
six = "*"

[[package]]
name = "rpds-py"
version = "0.22.3"
@@ -7354,7 +7607,7 @@ version = "1.17.0"
description = "Python 2 and 3 compatibility utilities"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
groups = ["main", "evaluation", "runtime", "test"]
groups = ["main", "evaluation", "runtime", "test", "testgeneval"]
files = [
{file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"},
{file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
@@ -8112,7 +8365,7 @@ version = "0.23.6"
description = "Python grammar for tree-sitter"
optional = false
python-versions = ">=3.9"
groups = ["main"]
groups = ["main", "testgeneval"]
files = [
{file = "tree_sitter_python-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:28fbec8f74eeb2b30292d97715e60fac9ccf8a8091ce19b9d93e9b580ed280fb"},
{file = "tree_sitter_python-0.23.6-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:680b710051b144fedf61c95197db0094f2245e82551bf7f0c501356333571f7a"},
@@ -9056,4 +9309,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12"
content-hash = "9b74f62a4afa719a1f7167e0b3b45cdaf282c2e18fd2931da91c0f1b22776178"
content-hash = "31c10902e2e52ca3ef7e3b0c7239f1ffa65f68a51fabaaa6b175124318a51d7b"

@@ -154,3 +154,9 @@ style = "semver"

[tool.poetry.scripts]
openhands = "openhands.core.cli:main"

[tool.poetry.group.testgeneval.dependencies]
fuzzywuzzy = "^0.18.0"
rouge = "^1.0.1"
python-levenshtein = "^0.26.1"
tree-sitter-python = "^0.23.6"

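The new testgeneval dependency group pulls in string-similarity and parsing libraries. As a rough illustration of what they enable (a sketch only, not the benchmark's actual metric code): fuzzywuzzy and python-levenshtein score lexical similarity between a generated test and a reference test, rouge computes ROUGE overlap, and tree-sitter-python (not shown here) parses the generated test source.

# Sketch only: the example strings and choice of metrics are illustrative,
# not taken from the benchmark implementation.
import Levenshtein                # installed via python-levenshtein
from fuzzywuzzy import fuzz
from rouge import Rouge

generated = 'def test_mean():\n    assert ds.mean() is not None\n'
reference = 'def test_mean():\n    assert dataset.mean() is not None\n'

edit_distance = Levenshtein.distance(generated, reference)   # absolute edit distance
fuzz_ratio = fuzz.ratio(generated, reference)                # 0-100 similarity score
rouge_l_f1 = Rouge().get_scores(generated, reference)[0]['rouge-l']['f']

print(edit_distance, fuzz_ratio, round(rouge_l_f1, 3))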