diff --git a/agenthub/SWE_agent/agent.py b/agenthub/SWE_agent/agent.py index 74840fb5e3..d62681579d 100644 --- a/agenthub/SWE_agent/agent.py +++ b/agenthub/SWE_agent/agent.py @@ -21,6 +21,7 @@ from .prompts import ( class SWEAgent(Agent): + VERSION = '1.0' """ An attempt to recreate swe_agent with output parsing, prompting style, and Application Computer Interface (ACI). diff --git a/agenthub/delegator_agent/agent.py b/agenthub/delegator_agent/agent.py index 42e8c5cedd..a6870c54c2 100644 --- a/agenthub/delegator_agent/agent.py +++ b/agenthub/delegator_agent/agent.py @@ -6,6 +6,7 @@ from opendevin.llm.llm import LLM class DelegatorAgent(Agent): + VERSION = '1.0' """ The planner agent utilizes a special prompting strategy to create long term plans for solving problems. The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step. diff --git a/agenthub/micro/agent.py b/agenthub/micro/agent.py index 08275e48d2..6e5a73e0d6 100644 --- a/agenthub/micro/agent.py +++ b/agenthub/micro/agent.py @@ -35,11 +35,14 @@ def history_to_json(obj, **kwargs): # process history, make it simpler. processed_history = [] for action, observation in obj: - processed_history.append((event_to_memory(action), event_to_memory(observation))) + processed_history.append( + (event_to_memory(action), event_to_memory(observation)) + ) return json.dumps(processed_history, **kwargs) class MicroAgent(Agent): + VERSION = '1.0' prompt = '' agent_definition: dict = {} diff --git a/agenthub/monologue_agent/agent.py b/agenthub/monologue_agent/agent.py index a08b5e119c..a8e7d73847 100644 --- a/agenthub/monologue_agent/agent.py +++ b/agenthub/monologue_agent/agent.py @@ -80,6 +80,7 @@ INITIAL_THOUGHTS = [ class MonologueAgent(Agent): + VERSION = '1.0' """ The Monologue Agent utilizes long and short term memory to complete tasks. Long term memory is stored as a LongTermMemory object and the model uses it to search for examples from the past. 
diff --git a/agenthub/planner_agent/agent.py b/agenthub/planner_agent/agent.py index 0d78b479e1..3979b0a30c 100644 --- a/agenthub/planner_agent/agent.py +++ b/agenthub/planner_agent/agent.py @@ -7,6 +7,7 @@ from .prompt import get_prompt, parse_response class PlannerAgent(Agent): + VERSION = '1.0' """ The planner agent utilizes a special prompting strategy to create long term plans for solving problems. The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step. diff --git a/evaluation/swe_bench/README.md b/evaluation/swe_bench/README.md index 036e2d972a..bd489af3d8 100644 --- a/evaluation/swe_bench/README.md +++ b/evaluation/swe_bench/README.md @@ -68,20 +68,42 @@ temperature = 0.0 ## Test if your environment works +Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.0` +docker image. Then run this python script: + ```bash -python3 evaluation/swe_bench/swe_env_box.py +poetry run python evaluation/swe_bench/swe_env_box.py ``` -If you get to the interactive shell successfully, it means success! +If you get to the interactive shell successfully, it means your environment works! +If you see an error, please make sure your `config.toml` contains all +`SWEBench eval specific` settings as shown in the previous section. ## Run Inference on SWE-Bench Instances ```bash -./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview +./evaluation/swe_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit] +# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 300 ``` -You can replace `eval_gpt4_1106_preview` with any model you setted up in `config.toml`. +where `model_config` is mandatory, while `agent` and `eval_limit` are optional. +`model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your +LLM settings, as defined in your `config.toml`. + +`agent`, e.g. 
`CodeActAgent`, is the name of the agent for benchmarks, defaulting +to `CodeActAgent`. + +`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By +default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note: +in order to use `eval_limit`, you must also set `agent`. + +Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview` and CodeActAgent, +then your command would be: + +```bash +./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 10 +``` ## Evaluate Generated Patches diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 84d9bc9331..7dc684e106 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -184,7 +184,12 @@ def get_test_result(instance, sandbox, workspace_dir_name): def process_instance( - instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True + instance, + agent_class, + metadata, + skip_workspace_mount, + eval_output_dir, + reset_logger: bool = True, ): workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace') # create process-specific workspace dir @@ -206,7 +211,7 @@ def process_instance( # add back the console handler to print ONE line logger.addHandler(get_console_handler()) logger.info( - f'Starting evaluation for instance {instance.instance_id}.\nLOG: tail -f {log_file}' + f'Starting evaluation for instance {instance.instance_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell' ) # Remove all existing handlers from logger for handler in logger.handlers[:]: @@ -417,6 +422,7 @@ if __name__ == '__main__': agent_class, metadata, skip_workspace_mount, + eval_output_dir, reset_logger=bool(num_workers > 1), ) future.add_done_callback(update_progress) diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh index a024cf6d14..73ac7ea988 100755 --- 
a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/swe_bench/scripts/run_infer.sh @@ -1,21 +1,33 @@ #!/bin/bash +MODEL_CONFIG=$1 +AGENT=$2 +EVAL_LIMIT=$3 + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi -AGENT=CodeActAgent # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin # We need to track the version of Agent in the evaluation to make sure results are comparable -AGENT_VERSION=v$(python3 -c "from agenthub.codeact_agent import CodeActAgent; print(CodeActAgent.VERSION)") -MODEL_CONFIG=$1 +AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)") echo "AGENT: $AGENT" echo "AGENT_VERSION: $AGENT_VERSION" echo "MODEL_CONFIG: $MODEL_CONFIG" -# You should add $MODEL_CONFIG in your `config.toml` - -poetry run python3 evaluation/swe_bench/run_infer.py \ +COMMAND="poetry run python evaluation/swe_bench/run_infer.py \ --agent-cls $AGENT \ --llm-config $MODEL_CONFIG \ --max-iterations 50 \ --max-chars 10000000 \ --eval-num-workers 8 \ - --eval-note $AGENT_VERSION + --eval-note $AGENT_VERSION" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/opendevin/core/config.py b/opendevin/core/config.py index c10de725b4..5471217b07 100644 --- a/opendevin/core/config.py +++ b/opendevin/core/config.py @@ -379,6 +379,7 @@ def get_parser(): type=int, help='The maximum number of characters to send to and receive from LLM per task', ) + # --eval configs are for evaluations only parser.add_argument( '--eval-output-dir', default='evaluation/evaluation_outputs/outputs',