diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py
index 6457e11288..9fa3001b60 100644
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -177,9 +177,10 @@ def process_instance(
         signature_file=f'{instance.instance_name}.py',
     )
     if USE_UNIT_TESTS:
+        print(f'\nInstruction to run test_file: {instance.instance_name}_test.py\n')
         instruction += (
-            f'Use the test_file: {instance.instance_name}_test.py, to verify '
-            'the correctness of your solution. DO NOT EDIT the test file.\n\n'
+            f'Use `python -m unittest {instance.instance_name}_test.py` to run the test_file '
+            'and verify the correctness of your solution. DO NOT EDIT the test file.\n\n'
         )
 
     instruction += (
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index 7a91402571..982ea42323 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -139,13 +139,14 @@ def make_metadata(
     details: dict[str, Any] | None = None,
 ) -> EvalMetadata:
     model_name = llm_config.model.split('/')[-1]
+    model_path = model_name.replace(':', '_')
     eval_note = f'_N_{eval_note}' if eval_note else ''
 
     eval_output_path = os.path.join(
         eval_output_dir,
         dataset_name,
         agent_class,
-        f'{model_name}_maxiter_{max_iterations}{eval_note}',
+        f'{model_path}_maxiter_{max_iterations}{eval_note}',
     )
 
     pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
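
For reference, a minimal sketch (not part of the patch) of what the `model_path` sanitization in `make_metadata` buys: colons in a model identifier would otherwise end up in the output directory name. The model id, dataset name, agent class, and output dir below are hypothetical placeholders for the values `make_metadata` actually receives.

```python
import os

# Hypothetical inputs; the real values come from llm_config and make_metadata's arguments.
llm_model = 'ollama/llama3:70b'
max_iterations = 30
eval_note = ''

model_name = llm_model.split('/')[-1]       # 'llama3:70b'
model_path = model_name.replace(':', '_')   # 'llama3_70b' -- colon replaced for a filesystem-safe path

print(os.path.join(
    'evaluation_outputs',   # hypothetical eval_output_dir
    'AiderBench',           # hypothetical dataset_name
    'CodeActAgent',         # hypothetical agent_class
    f'{model_path}_maxiter_{max_iterations}{eval_note}',
))
# evaluation_outputs/AiderBench/CodeActAgent/llama3_70b_maxiter_30
```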