#!/bin/bash
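# Integration test driver: runs every test in test_agent.py against every agent,
# replaying the mock LLM responses recorded under tests/integration/mock/.
# When a test fails, it regenerates the mock prompts (and, if that is not enough,
# the real LLM responses) and re-runs the test to verify.

# Exit on errors, including failures inside pipelines.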
set -eo pipefail


##############################################################
##          CONSTANTS AND ENVIRONMENT VARIABLES             ##
##############################################################

echo -e "\n\n============================================================"

# Unset environment variables that might disturb testing
unset OPENAI_API_KEY
unset SANDBOX_ENV_OPENAI_API_KEY
unset OPENAI_BASE_URL
unset OPENAI_MODEL

# Get the absolute path of the directory containing this script,
# resolving symlinks along the way.
get_script_dir() {
  local source="${BASH_SOURCE[0]}"
  while [ -h "$source" ]; do
    local dir="$( cd -P "$( dirname "$source" )" && pwd )"
    source="$(readlink -f "$source" 2>/dev/null || readlink "$source")"
    # If the link was relative, resolve it against the directory it was found in
    [[ $source != /* ]] && source="$dir/$source"
  done
  echo "$( cd -P "$( dirname "$source" )" && pwd )"
}

TMP_FILE="${TMP_FILE:-tmp.log}"

if [ -z "$WORKSPACE_BASE" ]; then
  WORKSPACE_BASE=$(pwd)
fi

DEBUG=true # needed for llm logging to create mock files!

if [ -z "$LOG_TO_FILE" ]; then
  LOG_TO_FILE=true
fi

export SCRIPT_DIR=$(get_script_dir)
export PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..")
export LOG_DIR="$PROJECT_ROOT/logs"
echo "Current working directory: $(pwd)"
echo "SCRIPT_DIR: $SCRIPT_DIR"
echo "PROJECT_ROOT: $PROJECT_ROOT"
echo "LOG_DIR: $LOG_DIR"
echo "LOG_TO_FILE: $LOG_TO_FILE"
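
# Run the tests inside a dedicated _test_workspace subdirectory so that test
# artifacts never pollute the caller's workspace or mount path.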
WORKSPACE_BASE="${WORKSPACE_BASE}/_test_workspace"
mkdir -p "$WORKSPACE_BASE"
chmod -R 777 "$WORKSPACE_BASE"
WORKSPACE_BASE=$(realpath "$WORKSPACE_BASE")

if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
  WORKSPACE_MOUNT_PATH="$WORKSPACE_BASE"
else
  WORKSPACE_MOUNT_PATH="${WORKSPACE_MOUNT_PATH}/_test_workspace"
  mkdir -p "$WORKSPACE_MOUNT_PATH"
  chmod -R 755 "$WORKSPACE_MOUNT_PATH"
  WORKSPACE_MOUNT_PATH=$(realpath "$WORKSPACE_MOUNT_PATH")
fi

WORKSPACE_MOUNT_PATH_IN_SANDBOX="${WORKSPACE_MOUNT_PATH_IN_SANDBOX:-/workspace}"

echo "WORKSPACE_BASE: $WORKSPACE_BASE"
echo "WORKSPACE_MOUNT_PATH: $WORKSPACE_MOUNT_PATH"
echo "WORKSPACE_MOUNT_PATH_IN_SANDBOX: $WORKSPACE_MOUNT_PATH_IN_SANDBOX"

# Ensure we're in the correct directory
cd "$PROJECT_ROOT" || exit 1

mkdir -p "$WORKSPACE_BASE"

# Use the environment variable if it is already set
TEST_RUNTIME="${TEST_RUNTIME:-eventstream}"
if [ -z "$SANDBOX_BASE_CONTAINER_IMAGE" ]; then
  SANDBOX_BASE_CONTAINER_IMAGE="nikolaik/python-nodejs:python3.11-nodejs22"
fi

MAX_ITERATIONS=20
echo "TEST_RUNTIME: $TEST_RUNTIME"
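
# Agents to test. The tasks and test_names arrays below must stay index-aligned:
# tasks[i] is the prompt used when regenerating mock responses for test_names[i].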
agents=(
  "DelegatorAgent"
  "ManagerAgent"
  "BrowsingAgent"
  "CodeActAgent"
  "PlannerAgent"
  "CodeActSWEAgent"
)
tasks=(
  "Fix typos in bad.txt."
  "Write a shell script 'hello.sh' that prints 'hello'."
  "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
  "Write a git commit message for the current staging area."
  #"Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt."
  "Browse localhost:8000, and tell me the ultimate answer to life."
)
test_names=(
  "test_edits"
  "test_write_simple_script"
  "test_ipython"
  "test_simple_task_rejection"
  #"test_ipython_module" NOT DETERMINISTIC IN NUMBER OF LLM RESPONSES!
  "test_browse_internet"
)

num_of_tests=${#test_names[@]}
num_of_agents=${#agents[@]}

##############################################################
##                        FUNCTIONS                         ##
##############################################################

# Run one integration test ($test_name) for one agent ($agent)
run_test() {
  # Ensure we're in the correct directory
  cd "$PROJECT_ROOT" || exit 1

  local pytest_cmd="poetry run pytest --cache-clear -vvsxx $SCRIPT_DIR/test_agent.py::$test_name"
  # Collect coverage when running in CI (TEST_IN_CI is set)
  if [ -n "$TEST_IN_CI" ]; then
    pytest_cmd+=" --cov=agenthub --cov=openhands --cov-report=xml --cov-append"
  fi
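
  # Pass the test configuration through the environment so test_agent.py and the
  # OpenHands runtime pick it up; tee the output to $TMP_FILE so it can be
  # scanned for known infrastructure failures below.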
  env SCRIPT_DIR="$SCRIPT_DIR" \
    PROJECT_ROOT="$PROJECT_ROOT" \
    WORKSPACE_BASE="$WORKSPACE_BASE" \
    WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
    MAX_ITERATIONS="$MAX_ITERATIONS" \
    DEFAULT_AGENT=$agent \
    TEST_RUNTIME="$TEST_RUNTIME" \
    DEBUG=$DEBUG \
    LLM=$LLM \
    LOG_TO_FILE=$LOG_TO_FILE \
    FORCE_REGENERATE=$FORCE_REGENERATE \
    SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
    $pytest_cmd 2>&1 | tee "$TMP_FILE"

  # Capture pytest's exit code (tee would otherwise mask it)
  pytest_exit_code=${PIPESTATUS[0]}
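
  # Scan the captured output for known environment problems; these make the
  # failure meaningless, so abort the whole run and let the user fix them first.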
if grep -q "docker.errors.DockerException" $TMP_FILE; then
|
|
echo "Error: docker.errors.DockerException found in the output. Exiting."
|
|
echo "Please check if your Docker daemon is running!"
|
|
exit 1
|
|
fi
|
|
|
|
if grep -q "tenacity.RetryError" $TMP_FILE; then
|
|
echo "Error: tenacity.RetryError found in the output. Exiting."
|
|
echo "This is mostly a transient error. Please retry."
|
|
exit 1
|
|
fi
|
|
|
|
if grep -q "ExceptionPxssh" $TMP_FILE; then
|
|
echo "Error: ExceptionPxssh found in the output. Exiting."
|
|
echo "Could not connect to sandbox via ssh. Please stop any stale docker container and retry."
|
|
exit 1
|
|
fi
|
|
|
|
if grep -q "Address already in use" $TMP_FILE; then
|
|
echo "Error: Address already in use found in the output. Exiting."
|
|
echo "Browsing tests need a local http server. Please check if there's any zombie process running start_http_server.py."
|
|
exit 1
|
|
fi
|
|
|
|
# Return the exit code of pytest
|
|
return $pytest_exit_code
|
|
}

# The browsing test needs a local HTTP server to browse
launch_http_server() {
  poetry run python "$SCRIPT_DIR/start_http_server.py" &
  HTTP_SERVER_PID=$!
  echo "Test http server launched, PID = $HTTP_SERVER_PID"
  # Give the server a moment to come up
  sleep 5
}
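
# Kill the test HTTP server (if one is running) and remove the temporary log.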
cleanup() {
  # Reset the working directory so getcwd() doesn't fail
  cd "$PROJECT_ROOT/tests"
  cd "$PROJECT_ROOT"
  echo "Cleaning up before exit..."
  if [ -n "$HTTP_SERVER_PID" ]; then
    echo "Killing HTTP server..."
    kill $HTTP_SERVER_PID || true
    unset HTTP_SERVER_PID
  fi
  [ -f "$TMP_FILE" ] && rm "$TMP_FILE"
  echo "Cleanup done!"
}
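
# NOTRAP can be set to any non-empty value to skip the cleanup trap below,
# e.g. to inspect $TMP_FILE or the test workspace after a failure.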
# Trap the EXIT signal to run the cleanup function
if [ -z "$NOTRAP" ]; then
  trap cleanup EXIT
fi

# Regenerate the prompts, replaying the existing LLM responses under
# tests/integration/mock/[test_runtime]_runtime/[agent]/[test_name]/response_*.log.
# This is a compromise: the prompts might be nonsense yet still pass the test,
# because no real LLM responds to them. The benefit is that developers don't have
# to regenerate real LLM responses when they only make a small change to the prompts.
regenerate_without_llm() {
  cd "$PROJECT_ROOT"

  # set -x to print the command being executed
  set -x
  env SCRIPT_DIR="$SCRIPT_DIR" \
    PROJECT_ROOT="$PROJECT_ROOT" \
    WORKSPACE_BASE="$WORKSPACE_BASE" \
    WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
    MAX_ITERATIONS="$MAX_ITERATIONS" \
    FORCE_APPLY_PROMPTS=true \
    DEFAULT_AGENT="$agent" \
    TEST_RUNTIME="$TEST_RUNTIME" \
    LLM="$LLM" \
    DEBUG="$DEBUG" \
    LOG_TO_FILE="$LOG_TO_FILE" \
    FORCE_REGENERATE="$FORCE_REGENERATE" \
    SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
    poetry run pytest -s "$SCRIPT_DIR/test_agent.py::$test_name"
  set +x
}
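
# Regenerate both prompts and responses by running the agent end-to-end against a
# real LLM (this costs money), then archive the resulting llm logs as the new mock
# fixtures for this runtime/agent/test combination.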
regenerate_with_llm() {
  cd "$PROJECT_ROOT"

  # Start from a clean workspace, seeded with this test's fixture files (if any)
  rm -rf "$WORKSPACE_BASE"/*
  if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
    cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
  fi

  rm -rf "$LOG_DIR"
  rm -rf "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name"/*
  # set -x to print the command being executed
  set -x
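  # Pipe "/exit" into main.py's stdin so the interactive session ends as soon as
  # the agent has finished the task.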
echo -e "/exit\n" | \
|
|
env SCRIPT_DIR="$SCRIPT_DIR" \
|
|
PROJECT_ROOT="$PROJECT_ROOT" \
|
|
WORKSPACE_BASE="$WORKSPACE_BASE" \
|
|
WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
|
|
DEFAULT_AGENT=$agent \
|
|
RUNTIME="$TEST_RUNTIME" \
|
|
SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
|
|
LLM="$LLM" \
|
|
DEBUG="$DEBUG" \
|
|
LOG_TO_FILE="$LOG_TO_FILE" \
|
|
FORCE_REGENERATE="$FORCE_REGENERATE" \
|
|
poetry run python "$PROJECT_ROOT/openhands/core/main.py" \
|
|
-i "$MAX_ITERATIONS" \
|
|
-t "$task Do not ask me for confirmation at any point." \
|
|
-c $agent
|
|
set +x
|
|
|
|
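
  # Archive the prompt/response logs produced by this run as the new mock fixtures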
  mkdir -p "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/"
  mv "$LOG_DIR"/llm/**/* "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/"
}

##############################################################
##                       MAIN PROGRAM                       ##
##############################################################

if [ "$num_of_tests" -ne "${#test_names[@]}" ]; then
|
|
echo "Every task must correspond to one test case"
|
|
exit 1
|
|
fi
|
|
|
|
rm -rf "$LOG_DIR"
|
|
rm -rf "$WORKSPACE_BASE/*"
|
|
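
# For every (test, agent) pair: STEP 1 replays the recorded mock responses; if that
# fails and mocks exist, STEP 2/3 regenerate the prompts without an LLM and rerun;
# if it still fails, STEP 4/5 regenerate prompts and responses with a real LLM and
# rerun once more.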
for ((i = 0; i < num_of_tests; i++)); do
  task=${tasks[i]}
  test_name=${test_names[i]}

  # skip other tests if only one test is specified
  if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
    continue
  fi

  if [ "$test_name" = "test_browse_internet" ]; then
    launch_http_server
  fi

  for ((j = 0; j < num_of_agents; j++)); do
    agent=${agents[j]}

    # skip other agents if only one agent is specified
    if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
      continue
    fi

    echo -e "\n============================================================"
    echo -e "======== STEP 1: Running $test_name for $agent"
    echo -e "============================================================\n\n"
    # reset dir so getcwd() shouldn't fail
    cd "$PROJECT_ROOT/tests"
    cd "$PROJECT_ROOT"

    rm -rf "$WORKSPACE_BASE"/*
    if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
      cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
    fi
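
    # TEST_ONLY: abort on the first test failure instead of attempting regeneration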
if [ "$TEST_ONLY" ]; then
|
|
set -e
|
|
else
|
|
# Temporarily disable 'exit on error'
|
|
set +e
|
|
fi
|
|
|
|
TEST_STATUS=1
|
|
if [ -z "$FORCE_REGENERATE" ]; then
|
|
run_test
|
|
TEST_STATUS=$?
|
|
fi
|
|
# Re-enable 'exit on error'
|
|
set -e
|
|
|
|

    if [[ $TEST_STATUS -ne 0 ]]; then

      if [ "$FORCE_USE_LLM" ]; then
        echo -e "\n============================================================"
        echo -e "======== FORCE_USE_LLM is set, skipping steps 2 & 3"
        echo -e "============================================================\n\n"
      elif [ ! -d "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name" ]; then
        echo -e "\n============================================================"
        echo -e "======== No existing mock responses for ${TEST_RUNTIME}_runtime/$agent/$test_name, skipping steps 2 & 3"
        echo -e "============================================================\n\n"
      else
        echo -e "\n============================================================"
        echo -e "======== STEP 2: $test_name failed, regenerating prompts for $agent WITHOUT money cost"
        echo -e "============================================================\n\n"

        # Temporarily disable 'exit on error'
        set +e
        regenerate_without_llm

        echo -e "\n============================================================"
        echo -e "======== STEP 3: $test_name prompts regenerated for $agent, rerunning the test to verify"
        echo -e "============================================================\n\n\n"
        run_test
        TEST_STATUS=$?
        # Re-enable 'exit on error'
        set -e
      fi

      if [[ $TEST_STATUS -ne 0 ]]; then
        echo -e "\n============================================================"
        if [ "$FORCE_USE_LLM" ]; then
          echo -e "======== STEP 4: FORCE_USE_LLM is set, regenerating $test_name for $agent WITH money cost"
        else
          echo -e "======== STEP 4: $test_name failed, regenerating prompts and responses for $agent WITH money cost"
        fi
        echo -e "============================================================\n\n\n"

        regenerate_with_llm

        echo -e "\n============================================================"
        echo -e "======== STEP 5: $test_name prompts and responses regenerated for $agent, rerunning the test to verify"
        echo -e "============================================================\n\n\n"
        cd "$PROJECT_ROOT/tests"
        cd "$PROJECT_ROOT"
        # Temporarily disable 'exit on error'
        set +e
        run_test
        TEST_STATUS=$?
        # Re-enable 'exit on error'
        set -e

        if [[ $TEST_STATUS -ne 0 ]]; then
          echo -e "\n\n============================================================"
          echo -e "========== $test_name for $agent RERUN FAILED"
          echo -e "============================================================"
          echo -e "There are multiple possibilities:"
          echo -e "  1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
          echo -e "  2. The agent thinks it has finished the task, but fails the validation in the test code."
          echo -e "  3. There is something non-deterministic in the prompt."
          echo -e "  4. There is a bug in this script, or in the OpenHands code."
          echo -e "NOTE: Some of the above problems can sometimes be fixed by a retry (with a more powerful LLM)."
          echo -e "      You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
          echo -e "============================================================\n\n"
          exit 1
        else
          echo -e "\n\n============================================================"
          echo -e "======== $test_name for $agent RERUN PASSED"
          echo -e "============================================================\n\n\n"
          sleep 1
        fi
      else
        echo -e "\n\n============================================================"
        echo -e "======== $test_name for $agent RERUN PASSED"
        echo -e "============================================================\n\n\n"
        sleep 1
      fi
    else
      echo -e "\n\n============================================================"
      echo -e "========== $test_name for $agent PASSED"
      echo -e "============================================================\n\n\n"
      sleep 1
    fi
  done

  if [ "$test_name" = "test_browse_internet" ]; then
    kill $HTTP_SERVER_PID || true
  fi
done

rm -rf "$LOG_DIR"
rm -rf "$WORKSPACE_BASE"
echo "Done!"
cd "$PROJECT_ROOT"