diff --git a/evaluation/swe_bench/README.md b/evaluation/swe_bench/README.md index 958d6b7649..44efd1cb1f 100644 --- a/evaluation/swe_bench/README.md +++ b/evaluation/swe_bench/README.md @@ -51,6 +51,7 @@ sandbox_timeout = 120 use_host_network = false run_as_devin = false enable_auto_lint = true +max_budget_per_task = 4 # 4 USD # TODO: Change these to the model you want to evaluate [eval_gpt4_1106_preview] diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index d89835559d..fdd073e933 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -209,7 +209,7 @@ def process_instance( if reset_logger: # Set up logger log_file = os.path.join( - eval_output_dir, 'logs', f'instance_{instance.instance_id}.log' + eval_output_dir, 'infer_logs', f'instance_{instance.instance_id}.log' ) # Remove all existing handlers from logger for handler in logger.handlers[:]: