diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py
index 6457e11288..9fa3001b60 100644
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -177,9 +177,10 @@ def process_instance(
         signature_file=f'{instance.instance_name}.py',
     )
     if USE_UNIT_TESTS:
+        print(f'\nInstruction to run test_file: {instance.instance_name}_test.py\n')
         instruction += (
-            f'Use the test_file: {instance.instance_name}_test.py, to verify '
-            'the correctness of your solution. DO NOT EDIT the test file.\n\n'
+            f'Use `python -m unittest {instance.instance_name}_test.py` to run the test_file '
+            'and verify the correctness of your solution. DO NOT EDIT the test file.\n\n'
         )
 
     instruction += (
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index 7a91402571..982ea42323 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -139,13 +139,14 @@ def make_metadata(
     details: dict[str, Any] | None = None,
 ) -> EvalMetadata:
     model_name = llm_config.model.split('/')[-1]
+    model_path = model_name.replace(':', '_')
     eval_note = f'_N_{eval_note}' if eval_note else ''
 
     eval_output_path = os.path.join(
         eval_output_dir,
         dataset_name,
         agent_class,
-        f'{model_name}_maxiter_{max_iterations}{eval_note}',
+        f'{model_path}_maxiter_{max_iterations}{eval_note}',
     )
 
     pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
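
For reference, a minimal sketch (not part of the patch) of what the `model_path` sanitization in `make_metadata` buys: colons in a model identifier would otherwise end up in the output directory name. The model id, dataset name, agent class, and output dir below are hypothetical placeholders for the values `make_metadata` actually receives.

```python
import os

# Hypothetical inputs; the real values come from llm_config and make_metadata's arguments.
llm_model = 'ollama/llama3:70b'
max_iterations = 30
eval_note = ''

model_name = llm_model.split('/')[-1]       # 'llama3:70b'
model_path = model_name.replace(':', '_')   # 'llama3_70b' -- colon replaced for a filesystem-safe path

print(os.path.join(
    'evaluation_outputs',   # hypothetical eval_output_dir
    'AiderBench',           # hypothetical dataset_name
    'CodeActAgent',         # hypothetical agent_class
    f'{model_path}_maxiter_{max_iterations}{eval_note}',
))
# evaluation_outputs/AiderBench/CodeActAgent/llama3_70b_maxiter_30
```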