diff --git a/agenthub/SWE_agent/agent.py b/agenthub/SWE_agent/agent.py index 74840fb5e3..d62681579d 100644 --- a/agenthub/SWE_agent/agent.py +++ b/agenthub/SWE_agent/agent.py @@ -21,6 +21,7 @@ from .prompts import ( class SWEAgent(Agent): + VERSION = '1.0' """ An attempt to recreate swe_agent with output parsing, prompting style, and Application Computer Interface (ACI). diff --git a/agenthub/delegator_agent/agent.py b/agenthub/delegator_agent/agent.py index 42e8c5cedd..a6870c54c2 100644 --- a/agenthub/delegator_agent/agent.py +++ b/agenthub/delegator_agent/agent.py @@ -6,6 +6,7 @@ from opendevin.llm.llm import LLM class DelegatorAgent(Agent): + VERSION = '1.0' """ The planner agent utilizes a special prompting strategy to create long term plans for solving problems. The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step. diff --git a/agenthub/micro/agent.py b/agenthub/micro/agent.py index 08275e48d2..6e5a73e0d6 100644 --- a/agenthub/micro/agent.py +++ b/agenthub/micro/agent.py @@ -35,11 +35,14 @@ def history_to_json(obj, **kwargs): # process history, make it simpler. processed_history = [] for action, observation in obj: - processed_history.append((event_to_memory(action), event_to_memory(observation))) + processed_history.append( + (event_to_memory(action), event_to_memory(observation)) + ) return json.dumps(processed_history, **kwargs) class MicroAgent(Agent): + VERSION = '1.0' prompt = '' agent_definition: dict = {} diff --git a/agenthub/monologue_agent/agent.py b/agenthub/monologue_agent/agent.py index a08b5e119c..a8e7d73847 100644 --- a/agenthub/monologue_agent/agent.py +++ b/agenthub/monologue_agent/agent.py @@ -80,6 +80,7 @@ INITIAL_THOUGHTS = [ class MonologueAgent(Agent): + VERSION = '1.0' """ The Monologue Agent utilizes long and short term memory to complete tasks. Long term memory is stored as a LongTermMemory object and the model uses it to search for examples from the past. 
diff --git a/agenthub/planner_agent/agent.py b/agenthub/planner_agent/agent.py index 0d78b479e1..3979b0a30c 100644 --- a/agenthub/planner_agent/agent.py +++ b/agenthub/planner_agent/agent.py @@ -7,6 +7,7 @@ from .prompt import get_prompt, parse_response class PlannerAgent(Agent): + VERSION = '1.0' """ The planner agent utilizes a special prompting strategy to create long term plans for solving problems. The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step. diff --git a/evaluation/swe_bench/README.md b/evaluation/swe_bench/README.md index 036e2d972a..bd489af3d8 100644 --- a/evaluation/swe_bench/README.md +++ b/evaluation/swe_bench/README.md @@ -68,20 +68,42 @@ temperature = 0.0 ## Test if your environment works +Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.0` +docker image. Then run this python script: + ```bash -python3 evaluation/swe_bench/swe_env_box.py +poetry run python evaluation/swe_bench/swe_env_box.py ``` -If you get to the interactive shell successfully, it means success! +If you get to the interactive shell successfully, it means your environment works! +If you see an error, please make sure your `config.toml` contains all +`SWEBench eval specific` settings as shown in the previous section. ## Run Inference on SWE-Bench Instances ```bash -./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview +./evaluation/swe_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit] +# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 300 ``` -You can replace `eval_gpt4_1106_preview` with any model you setted up in `config.toml`. +where `model_config` is mandatory, while `agent` and `eval_limit` are optional. +`model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your +LLM settings, as defined in your `config.toml`. + +`agent`, e.g. 
`CodeActAgent`, is the name of the agent for benchmarks, defaulting +to `CodeActAgent`. + +`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By +default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note: +in order to use `eval_limit`, you must also set `agent`. + +Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview` and CodeActAgent, +then your command would be: + +```bash +./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 10 +``` ## Evaluate Generated Patches diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 84d9bc9331..7dc684e106 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -184,7 +184,12 @@ def get_test_result(instance, sandbox, workspace_dir_name): def process_instance( - instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True + instance, + agent_class, + metadata, + skip_workspace_mount, + eval_output_dir, + reset_logger: bool = True, ): workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace') # create process-specific workspace dir @@ -206,7 +211,7 @@ def process_instance( # add back the console handler to print ONE line logger.addHandler(get_console_handler()) logger.info( - f'Starting evaluation for instance {instance.instance_id}.\nLOG: tail -f {log_file}' + f'Starting evaluation for instance {instance.instance_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell' ) # Remove all existing handlers from logger for handler in logger.handlers[:]: @@ -417,6 +422,7 @@ if __name__ == '__main__': agent_class, metadata, skip_workspace_mount, + eval_output_dir, reset_logger=bool(num_workers > 1), ) future.add_done_callback(update_progress) diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh index a024cf6d14..73ac7ea988 100755 --- 
a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/swe_bench/scripts/run_infer.sh @@ -1,21 +1,33 @@ #!/bin/bash +MODEL_CONFIG=$1 +AGENT=$2 +EVAL_LIMIT=$3 + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi -AGENT=CodeActAgent # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin # We need to track the version of Agent in the evaluation to make sure results are comparable -AGENT_VERSION=v$(python3 -c "from agenthub.codeact_agent import CodeActAgent; print(CodeActAgent.VERSION)") -MODEL_CONFIG=$1 +AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)") echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -# You should add $MODEL_CONFIG in your `config.toml` - -poetry run python3 evaluation/swe_bench/run_infer.py \ +COMMAND="poetry run python evaluation/swe_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 50 \ --max-chars 10000000 \ --eval-num-workers 8 \ - --eval-note $AGENT_VERSION + --eval-note $AGENT_VERSION" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/opendevin/core/config.py b/opendevin/core/config.py index c10de725b4..5471217b07 100644 --- a/opendevin/core/config.py +++ b/opendevin/core/config.py @@ -379,6 +379,7 @@ def get_parser(): type=int, help='The maximum number of characters to send to and receive from LLM per task', ) + # --eval configs are for evaluations only parser.add_argument( '--eval-output-dir', default='evaluation/evaluation_outputs/outputs',