mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
55 Commits
github-tok
...
eval/24-se
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7da6e06da6 | ||
|
|
c2223a0fe4 | ||
|
|
f2a48a870c | ||
|
|
61d99e9e37 | ||
|
|
9af6399a90 | ||
|
|
ac1459b0c9 | ||
|
|
e5c5e1c4e5 | ||
|
|
cc03b59238 | ||
|
|
6999d969bb | ||
|
|
f446237081 | ||
|
|
891b02d1ce | ||
|
|
78cbd90df0 | ||
|
|
4ae0a3c887 | ||
|
|
6d9385baa2 | ||
|
|
7eb44cdeff | ||
|
|
5a64cf2bca | ||
|
|
b24a7821ec | ||
|
|
caa0f03c7b | ||
|
|
e0f91f2aef | ||
|
|
5d1355ffa0 | ||
|
|
4c3068c711 | ||
|
|
68b2152942 | ||
|
|
b7416a4723 | ||
|
|
770af8d74b | ||
|
|
090f0df452 | ||
|
|
c92cbbb201 | ||
|
|
ee37af93a1 | ||
|
|
e09e8b4ebf | ||
|
|
b96d798efa | ||
|
|
9a9d376772 | ||
|
|
9e2a693ed4 | ||
|
|
cc3c34c90a | ||
|
|
279443a563 | ||
|
|
8a9d9576a9 | ||
|
|
79867629db | ||
|
|
963f0db6ab | ||
|
|
4e93a24e44 | ||
|
|
20722da8ca | ||
|
|
b02c98f683 | ||
|
|
44b5bffd34 | ||
|
|
b720eceb59 | ||
|
|
fb6da23220 | ||
|
|
d843fb8bab | ||
|
|
33c5cdeb93 | ||
|
|
460aa3acbd | ||
|
|
4ae8f9cf05 | ||
|
|
2c7b214a74 | ||
|
|
283ef9becc | ||
|
|
369ceecc63 | ||
|
|
fe5a67e96d | ||
|
|
cf5da84b6f | ||
|
|
a314309b57 | ||
|
|
a42cc05481 | ||
|
|
e0cdaa2a58 | ||
|
|
5fa8fde2f0 |
65
.github/workflows/ghcr_runtime.yml
vendored
65
.github/workflows/ghcr_runtime.yml
vendored
@@ -26,6 +26,67 @@ on:
|
||||
default: ''
|
||||
|
||||
jobs:
|
||||
# Builds the OpenHands Docker images
|
||||
ghcr_build_app:
|
||||
name: Build App Image
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
outputs:
|
||||
hash_from_app_image: ${{ steps.get_hash_in_app_image.outputs.hash_from_app_image }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
uses: jlumbroso/free-disk-space@main
|
||||
with:
|
||||
# this might remove tools that are actually needed,
|
||||
# if set to "true" but frees about 6 GB
|
||||
tool-cache: true
|
||||
# all of these default to true, but feel free to set to
|
||||
# "false" if necessary for your workflow
|
||||
android: true
|
||||
dotnet: true
|
||||
haskell: true
|
||||
large-packages: true
|
||||
docker-images: false
|
||||
swap-storage: true
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3.0.0
|
||||
with:
|
||||
image: tonistiigi/binfmt:latest
|
||||
- name: Login to GHCR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Build and push app image
|
||||
if: "!github.event.pull_request.head.repo.fork"
|
||||
run: |
|
||||
./containers/build.sh openhands ${{ github.repository_owner }} --push
|
||||
- name: Build app image
|
||||
if: "github.event.pull_request.head.repo.fork"
|
||||
run: |
|
||||
./containers/build.sh openhands image ${{ github.repository_owner }}
|
||||
- name: Get hash in App Image
|
||||
id: get_hash_in_app_image
|
||||
run: |
|
||||
# Lowercase the repository owner
|
||||
export REPO_OWNER=${{ github.repository_owner }}
|
||||
REPO_OWNER=$(echo $REPO_OWNER | tr '[:upper:]' '[:lower:]')
|
||||
# Run the build script in the app image
|
||||
docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ github.sha }} /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
|
||||
# Get the hash from the build script
|
||||
hash_from_app_image=$(cat docker-outputs.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
|
||||
echo "hash_from_app_image=$hash_from_app_image" >> $GITHUB_OUTPUT
|
||||
echo "Hash from app image: $hash_from_app_image"
|
||||
|
||||
|
||||
# Builds the runtime Docker images
|
||||
ghcr_build_runtime:
|
||||
name: Build Image
|
||||
@@ -56,7 +117,9 @@ jobs:
|
||||
docker-images: false
|
||||
swap-storage: true
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
uses: docker/setup-qemu-action@v3.0.0
|
||||
with:
|
||||
image: tonistiigi/binfmt:latest
|
||||
- name: Login to GHCR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
|
||||
@@ -40,6 +40,10 @@ class CodeActResponseParser(ResponseParser):
|
||||
if action is None:
|
||||
return ''
|
||||
for lang in ['bash', 'ipython', 'browse']:
|
||||
# special handling for DeepSeek: it has stop-word bug and returns </execute_ipython instead of </execute_ipython>
|
||||
if f'</execute_{lang}' in action and f'</execute_{lang}>' not in action:
|
||||
action = action.replace(f'</execute_{lang}', f'</execute_{lang}>')
|
||||
|
||||
if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
|
||||
action += f'</execute_{lang}>'
|
||||
return action
|
||||
|
||||
@@ -37,7 +37,7 @@ ARG OPENHANDS_BUILD_VERSION #re-declare for this section
|
||||
ENV RUN_AS_OPENHANDS=true
|
||||
# A random number--we need this to be different from the user's UID on the host machine
|
||||
ENV OPENHANDS_USER_ID=42420
|
||||
ENV SANDBOX_API_HOSTNAME=host.docker.internal
|
||||
ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
|
||||
ENV USE_HOST_NETWORK=false
|
||||
ENV WORKSPACE_BASE=/opt/workspace_base
|
||||
ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
|
||||
|
||||
@@ -63,13 +63,13 @@ then your command would be:
|
||||
./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
|
||||
```
|
||||
|
||||
### Run Inference on `RemoteRuntime`
|
||||
### Run Inference on `RemoteRuntime` (experimental)
|
||||
|
||||
This is in limited beta. Contact Xingyao over slack if you want to try this out!
|
||||
|
||||
```bash
|
||||
# ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
|
||||
ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
|
||||
ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
|
||||
./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
|
||||
# This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with at most 30 iterations per instance and 16 workers running in parallel
|
||||
```
|
||||
@@ -157,6 +157,24 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
|
||||
- `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
|
||||
- `logs/`: a directory of test logs
|
||||
|
||||
### Run evaluation with `RemoteRuntime` (experimental)
|
||||
|
||||
This is in limited beta. Contact Xingyao over slack if you want to try this out!
|
||||
|
||||
```bash
|
||||
# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] [dataset] [dataset_split]
|
||||
ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
|
||||
evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
|
||||
# This example evaluates patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 workers running in parallel
|
||||
```
|
||||
|
||||
To clean up all existing runtimes that you've already started, run:
|
||||
|
||||
```bash
|
||||
ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
|
||||
```
|
||||
|
||||
|
||||
## Visualize Results
|
||||
|
||||
First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
|
||||
|
||||
377
evaluation/swe_bench/eval_infer.py
Normal file
377
evaluation/swe_bench/eval_infer.py
Normal file
@@ -0,0 +1,377 @@
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
from swebench.harness.grading import get_eval_report
|
||||
from swebench.harness.run_evaluation import (
|
||||
APPLY_PATCH_FAIL,
|
||||
APPLY_PATCH_PASS,
|
||||
)
|
||||
from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
|
||||
from swebench.harness.utils import load_swebench_dataset
|
||||
|
||||
from evaluation.swe_bench.run_infer import get_instance_docker_image
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
)
|
||||
from openhands.core.config import (
|
||||
AppConfig,
|
||||
SandboxConfig,
|
||||
get_parser,
|
||||
)
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime
|
||||
from openhands.events.action import CmdRunAction
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
|
||||
# TODO: migrate all swe-bench docker to ghcr.io/openhands
|
||||
DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
|
||||
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
|
||||
|
||||
|
||||
def process_git_patch(patch):
    """Normalize a raw git patch string produced by an inference run.

    Non-string or blank input yields ``''``. Otherwise CRLF line endings are
    converted to LF, everything before the first line starting with
    ``diff --git`` is dropped (when such a line exists), and the result is
    right-stripped and terminated with exactly one newline.
    """
    # Anything that is not a non-blank string counts as an empty patch.
    if not isinstance(patch, str) or not patch.strip():
        return ''

    normalized = patch.replace('\r\n', '\n')

    # Command output captured during inference may precede the real diff,
    # for example an echoed `git diff ...` command line followed by terminal
    # cursor-control residue. Locate the first real diff header and discard
    # whatever comes before it.
    rows = normalized.split('\n')
    header_index = next(
        (idx for idx, row in enumerate(rows) if row.startswith('diff --git')),
        None,
    )
    if header_index is not None:
        normalized = '\n'.join(rows[header_index:])

    # Make sure the last line ends with a single newline.
    return normalized.rstrip() + '\n'
|
||||
|
||||
|
||||
def get_config(instance: pd.Series) -> AppConfig:
    """Build the application config used to evaluate one SWE-bench instance.

    The sandbox runs on the container image looked up for the instance id.
    The runtime kind, API key and remote runtime URL are read from the
    ``RUNTIME``, ``ALLHANDS_API_KEY`` and ``SANDBOX_REMOTE_RUNTIME_API_URL``
    environment variables.
    """
    # Each SWE-bench instance is evaluated on its own dedicated image.
    instance_image = get_instance_docker_image(instance['instance_id'])
    logger.info(
        f'Using instance container image: {instance_image}. '
        'Please make sure this image exists. '
        'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )

    sandbox_config = SandboxConfig(
        base_container_image=instance_image,
        use_host_network=False,
        # large enough timeout, since some testcases take very long to run
        timeout=1800,
        api_key=os.environ.get('ALLHANDS_API_KEY'),
        remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
    )

    return AppConfig(
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
|
||||
|
||||
|
||||
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata | None = None,
    reset_logger: bool = True,
) -> EvalOutput:
    """Apply one predicted patch inside a fresh runtime and grade the result.

    Args:
        instance: Row carrying ``instance_id``, ``model_patch`` and
            ``test_spec`` (and optionally an existing ``test_result`` dict).
        metadata: Not used by this function; accepted so the signature matches
            what the evaluation driver passes.
        reset_logger: When true, logging is redirected to a per-instance log
            directory derived from the module-level ``output_file``.

    Returns:
        An ``EvalOutput`` whose ``test_result['report']`` holds the boolean
        flags ``empty_generation``, ``resolved``, ``failed_apply_patch``,
        ``error_eval`` and ``test_timeout``.

    Raises:
        RuntimeError: If the patch-apply command prints neither
            ``APPLY_PATCH_PASS`` nor ``APPLY_PATCH_FAIL``.
        AssertionError: If a setup command returns an unexpected observation.
    """
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        # `output_file` is assigned by the `__main__` block of this module.
        global output_file
        log_dir = output_file.replace('.jsonl', '.logs')
        os.makedirs(log_dir, exist_ok=True)
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    config = get_config(instance)
    instance_id = instance.instance_id
    model_patch = instance['model_patch']
    test_spec: TestSpec = instance['test_spec']
    logger.info(f'Starting evaluation for instance {instance_id}.')

    if 'test_result' not in instance.keys():
        instance['test_result'] = {}
    instance['test_result']['report'] = {
        'empty_generation': False,
        'resolved': False,
        'failed_apply_patch': False,
        'error_eval': False,
        'test_timeout': False,
    }

    # Nothing to evaluate: report it without starting a runtime.
    if model_patch == '':
        instance['test_result']['report']['empty_generation'] = True
        return EvalOutput(
            instance_id=instance_id,
            test_result=instance['test_result'],
        )

    runtime = create_runtime(config, sid=instance_id)

    # Everything that touches the runtime lives inside the try block so that
    # the runtime is closed even when a setup step (file copy, chmod, patch
    # apply) raises or fails an assertion.
    try:
        # Write the patch and the eval script locally, then copy both to /tmp
        with tempfile.TemporaryDirectory() as temp_dir:
            # Patch file
            patch_file_path = os.path.join(temp_dir, 'patch.diff')
            with open(patch_file_path, 'w') as f:
                f.write(model_patch)
            runtime.copy_to(patch_file_path, '/tmp')
            # Eval script
            eval_script_path = os.path.join(temp_dir, 'eval.sh')
            with open(eval_script_path, 'w') as f:
                f.write(test_spec.eval_script)
            runtime.copy_to(eval_script_path, '/tmp')

        # Set +x
        action = CmdRunAction(command='chmod +x /tmp/eval.sh')
        action.timeout = 600
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        # Apply patch: try `git apply` first and fall back to `patch`; the
        # command always echoes a PASS/FAIL marker that is inspected below.
        exec_command = (
            'cd /testbed && '
            "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
            "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
            "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
            "echo 'APPLY_PATCH_FAIL')))"
        )
        action = CmdRunAction(command=exec_command, keep_prompt=False)
        action.timeout = 600
        obs = runtime.run_action(action)
        assert isinstance(obs, CmdOutputObservation)
        apply_patch_output = obs.content
        assert isinstance(apply_patch_output, str)
        instance['test_result']['apply_patch_output'] = apply_patch_output

        if 'APPLY_PATCH_FAIL' in apply_patch_output:
            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
            instance['test_result']['report']['failed_apply_patch'] = True

            return EvalOutput(
                instance_id=instance_id,
                test_result=instance['test_result'],
            )
        elif 'APPLY_PATCH_PASS' in apply_patch_output:
            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')

            # Run eval script in background and save output to log file
            log_file = '/tmp/eval_output.log'
            action = CmdRunAction(
                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
            )
            action.timeout = 60  # Short timeout just to get the process ID
            obs = runtime.run_action(action)

            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
                pid = obs.content.split()[-1].strip()
                logger.info(
                    f'[{instance_id}] Evaluation process started with PID: {pid}'
                )

                # Poll for completion
                start_time = time.time()
                timeout = 1800  # 30 minutes
                while True:
                    seconds_elapsed = time.time() - start_time
                    if seconds_elapsed > timeout:
                        logger.info(
                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
                        )
                        instance['test_result']['report']['test_timeout'] = True
                        break
                    # `ps -p` exits with 1 once the process no longer exists
                    check_action = CmdRunAction(
                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
                    )
                    check_action.timeout = 60
                    check_obs = runtime.run_action(check_action)
                    if (
                        isinstance(check_obs, CmdOutputObservation)
                        and check_obs.content.split()[-1].strip() == '1'
                    ):
                        logger.info(
                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
                        )
                        break
                    logger.info(
                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
                    )
                    time.sleep(30)  # Wait for 30 seconds before checking again

                # Read the log file (also done after a timeout, so whatever
                # output exists so far is still graded)
                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
                cat_action.timeout = 300
                cat_obs = runtime.run_action(cat_action)

                # Grade answer
                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
                    test_output = cat_obs.content
                    assert isinstance(test_output, str)
                    instance['test_result']['test_output'] = test_output

                    # Get report from test output
                    logger.info(f'[{instance_id}] Grading answer...')
                    with tempfile.TemporaryDirectory() as temp_dir:
                        # Create a directory structure that matches the expected format
                        # NOTE: this is a hack to make the eval report format consistent
                        # with the original SWE-Bench eval script
                        grading_log_dir = os.path.join(temp_dir, 'logs', instance_id)
                        os.makedirs(grading_log_dir, exist_ok=True)
                        test_output_path = os.path.join(
                            grading_log_dir, 'test_output.txt'
                        )
                        with open(test_output_path, 'w') as f:
                            f.write(test_output)

                        _report = get_eval_report(
                            test_spec=test_spec,
                            prediction={
                                'model_patch': model_patch,
                                'instance_id': instance_id,
                            },
                            log_path=test_output_path,
                            include_tests_status=True,
                        )
                        report = _report[instance_id]
                        logger.info(
                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
                        )
                        instance['test_result']['report']['resolved'] = report[
                            'resolved'
                        ]
                else:
                    # Without the log there is nothing to grade; flag it
                    # instead of silently reporting a clean "not resolved".
                    logger.info(
                        f'[{instance_id}] Error when reading eval output:\n{cat_obs.content}'
                    )
                    instance['test_result']['report']['error_eval'] = True
            else:
                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
                instance['test_result']['report']['error_eval'] = True

            return EvalOutput(
                instance_id=instance_id,
                test_result=instance['test_result'],
            )
        else:
            logger.info(
                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
            )
            raise RuntimeError(
                instance_id,
                f'Unexpected output when applying patch:\n{apply_patch_output}',
                logger,
            )
    finally:
        runtime.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: load predictions from a jsonl file, evaluate
    # each one with `process_instance`, then log summary counts.
    parser = get_parser()
    parser.add_argument(
        '--input-file',
        type=str,
        help='Path to input predictions file',
        required=True,
    )
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='data set to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    args, _ = parser.parse_known_args()

    # Load SWE-Bench dataset
    full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
        args.dataset, args.split
    )
    instance_id_to_instance = {
        instance['instance_id']: instance for instance in full_dataset
    }
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
    )

    # Load predictions
    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
    predictions = pd.read_json(args.input_file, lines=True)
    assert (
        'instance_id' in predictions.columns
    ), 'Input file must contain instance_id column.'

    # A patch must be available either as a top-level `model_patch` column or
    # as the `git_patch` field of `test_result` (the field read just below).
    if 'model_patch' not in predictions.columns and not (
        'test_result' in predictions.columns
        and 'git_patch' in predictions['test_result'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain model_patch column OR test_result column with git_patch field.'
        )
    assert len(predictions['instance_id'].unique()) == len(
        predictions
    ), 'instance_id column must be unique.'

    if 'model_patch' not in predictions.columns:
        predictions['model_patch'] = predictions['test_result'].apply(
            lambda x: x['git_patch']
        )
    assert {'instance_id', 'model_patch'}.issubset(
        set(predictions.columns)
    ), 'Input file must contain instance_id and model_patch columns.'

    # Process model_patch
    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)

    # Merge predictions with dataset
    predictions['instance'] = predictions['instance_id'].apply(
        lambda x: instance_id_to_instance[x]
    )
    predictions['test_spec'] = predictions['instance'].apply(make_test_spec)

    # Prepare dataset
    # NOTE: `output_file` is also read as a global by `process_instance`.
    output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)

    run_evaluation(
        instances,
        metadata=None,
        output_file=output_file,
        num_workers=args.eval_num_workers,
        process_instance_func=process_instance,
    )

    # Load evaluated predictions & print number of resolved predictions
    evaluated_predictions = pd.read_json(output_file, lines=True)
    fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']

    def count_report_field(row, field):
        """Return the boolean report flag `field` stored on one output row."""
        return row['test_result']['report'][field]

    for field in fields:
        count = evaluated_predictions.apply(
            count_report_field, args=(field,), axis=1
        ).sum()
        logger.info(
            f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
        )
|
||||
@@ -130,6 +130,7 @@ def get_config(
|
||||
# large enough timeout, since some testcases take very long to run
|
||||
timeout=300,
|
||||
api_key=os.environ.get('ALLHANDS_API_KEY', None),
|
||||
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
|
||||
),
|
||||
# do not mount workspace
|
||||
workspace_base=None,
|
||||
|
||||
18
evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
Normal file → Executable file
18
evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
Normal file → Executable file
@@ -2,20 +2,26 @@
|
||||
|
||||
|
||||
# API base URL
|
||||
BASE_URL="https://api.all-hands.dev/v0"
|
||||
BASE_URL="https://runtime.eval.all-hands.dev"
|
||||
|
||||
# Get the list of runtimes
|
||||
runtimes=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
|
||||
--header "X-API-Key: ${ALLHANDS_API_KEY}" | jq -r '.runtimes | .[].runtime_id')
|
||||
response=$(curl --silent --location --request GET "${BASE_URL}/list" \
|
||||
--header "X-API-Key: ${ALLHANDS_API_KEY}")
|
||||
|
||||
n_runtimes=$(echo $response | jq -r '.total')
|
||||
echo "Found ${n_runtimes} runtimes. Stopping them..."
|
||||
|
||||
runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
|
||||
# Loop through each runtime and stop it
|
||||
for runtime_id in $runtimes; do
|
||||
echo "Stopping runtime: ${runtime_id}"
|
||||
curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
|
||||
counter=1
|
||||
for runtime_id in $runtime_ids; do
|
||||
echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
|
||||
curl --silent --location --request POST "${BASE_URL}/stop" \
|
||||
--header "X-API-Key: ${ALLHANDS_API_KEY}" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "{\"runtime_id\": \"${runtime_id}\"}"
|
||||
echo
|
||||
((counter++))
|
||||
done
|
||||
|
||||
echo "All runtimes have been stopped."
|
||||
|
||||
@@ -3,6 +3,8 @@ import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from evaluation.swe_bench.eval_infer import process_git_patch
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('oh_output_file', type=str)
|
||||
args = parser.parse_args()
|
||||
@@ -14,36 +16,6 @@ oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
|
||||
model_name = os.path.basename(os.path.dirname(args.oh_output_file))
|
||||
|
||||
|
||||
def process_git_patch(patch):
|
||||
if not isinstance(patch, str):
|
||||
return ''
|
||||
|
||||
if not patch.strip():
|
||||
# skip empty patches
|
||||
return ''
|
||||
|
||||
patch = patch.replace('\r\n', '\n')
|
||||
# There might be some weird characters at the beginning of the patch
|
||||
# due to some OpenHands inference command outputs
|
||||
|
||||
# FOR EXAMPLE:
|
||||
# git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
|
||||
# [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
|
||||
# diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
|
||||
# new file mode 100644
|
||||
# index 0000000000..fc13db5948
|
||||
|
||||
# We "find" the first line that starts with "diff" and then we remove lines before it
|
||||
lines = patch.split('\n')
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith('diff --git'):
|
||||
patch = '\n'.join(lines[i:])
|
||||
break
|
||||
|
||||
patch = patch.rstrip() + '\n' # Make sure the last line ends with a newline
|
||||
return patch
|
||||
|
||||
|
||||
def convert_row_to_swebench_format(row):
|
||||
if 'git_patch' in row:
|
||||
model_patch = row['git_patch']
|
||||
|
||||
27
evaluation/swe_bench/scripts/eval/download_gold_patch.py
Normal file
27
evaluation/swe_bench/scripts/eval/download_gold_patch.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import argparse
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
|
||||
# Command-line interface: where to write, and which dataset/split to read.
parser = argparse.ArgumentParser()
parser.add_argument(
    'output_filepath',
    type=str,
    help='Path to save the output file',
)
parser.add_argument(
    '--dataset_name',
    type=str,
    help='Name of the dataset to download',
    default='princeton-nlp/SWE-bench_Lite',
)
parser.add_argument(
    '--split',
    type=str,
    help='Split to download',
    default='test',
)
args = parser.parse_args()

dataset = load_dataset(args.dataset_name, split=args.split)
output_filepath = args.output_filepath
print(
    f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
)

# Re-key each dataset row so its `patch` is stored under `model_patch`.
patches = []
for row in dataset:
    patches.append({'instance_id': row['instance_id'], 'model_patch': row['patch']})
print(f'{len(patches)} gold patches loaded')

# One JSON record per line.
patches_frame = pd.DataFrame(patches)
patches_frame.to_json(output_filepath, lines=True, orient='records')
print(f'Patches saved to {output_filepath}')
|
||||
@@ -98,6 +98,8 @@ if [ -z "$INSTANCE_ID" ]; then
|
||||
|
||||
RESULT_OUTPUT_DIR=$(dirname $SWEBENCH_FORMAT_JSONL)
|
||||
echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"
|
||||
RESULT_OUTPUT_DIR_NAME=$(basename $RESULT_OUTPUT_DIR)
|
||||
echo "RESULT_OUTPUT_DIR_NAME: $RESULT_OUTPUT_DIR_NAME"
|
||||
|
||||
# move the eval results to the target directory
|
||||
mkdir -p $RESULT_OUTPUT_DIR
|
||||
@@ -106,7 +108,7 @@ if [ -z "$INSTANCE_ID" ]; then
|
||||
rm -rf $RESULT_OUTPUT_DIR/eval_outputs
|
||||
fi
|
||||
|
||||
mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
|
||||
mv logs/run_evaluation/$RUN_ID/$RESULT_OUTPUT_DIR_NAME $RESULT_OUTPUT_DIR
|
||||
mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
|
||||
echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt
|
||||
|
||||
|
||||
43
evaluation/swe_bench/scripts/eval_infer_remote.sh
Executable file
43
evaluation/swe_bench/scripts/eval_infer_remote.sh
Executable file
@@ -0,0 +1,43 @@
|
||||
#!/bin/bash
# Run evaluation/swe_bench/eval_infer.py on a predictions file.
#
# Usage: eval_infer_remote.sh INPUT_FILE [NUM_WORKERS] [DATASET] [SPLIT]
# Optional environment: EVAL_LIMIT (passed through as --eval-n-limit).
set -eo pipefail

INPUT_FILE=$1
NUM_WORKERS=$2
DATASET=$3
SPLIT=$4

if [ -z "$INPUT_FILE" ]; then
  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
  exit 1
fi

if [ -z "$DATASET" ]; then
  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
  DATASET="princeton-nlp/SWE-bench_Lite"
fi

if [ -z "$SPLIT" ]; then
  echo "SPLIT not specified, use default test"
  SPLIT="test"
fi

if [ -z "$NUM_WORKERS" ]; then
  echo "NUM_WORKERS not specified, use default 1"
  NUM_WORKERS=1
fi

echo "... Evaluating on $INPUT_FILE ..."

# Build the command as an array so that arguments containing spaces or shell
# metacharacters (e.g. the input path) are passed through verbatim instead of
# being re-split and re-interpreted by `eval`.
COMMAND=(
  poetry run python evaluation/swe_bench/eval_infer.py
  --eval-num-workers "$NUM_WORKERS"
  --input-file "$INPUT_FILE"
  --dataset "$DATASET"
  --split "$SPLIT"
)

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
fi

# Run the command
"${COMMAND[@]}"
|
||||
@@ -6,7 +6,6 @@ import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
import traceback
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from typing import Any, Awaitable, Callable, TextIO
|
||||
|
||||
import pandas as pd
|
||||
@@ -50,15 +49,16 @@ class EvalMetadata(BaseModel):
|
||||
class EvalOutput(BaseModel):
|
||||
# NOTE: User-specified
|
||||
instance_id: str
|
||||
instruction: str
|
||||
# output of the evaluation
|
||||
# store anything that is needed for the score calculation
|
||||
test_result: dict[str, Any]
|
||||
|
||||
instruction: str | None = None
|
||||
|
||||
# Interaction info
|
||||
metadata: EvalMetadata
|
||||
history: list[tuple[dict[str, Any], dict[str, Any]]]
|
||||
metrics: dict[str, Any]
|
||||
metadata: EvalMetadata | None = None
|
||||
history: list[tuple[dict[str, Any], dict[str, Any]]] | None = None
|
||||
metrics: dict[str, Any] | None = None
|
||||
error: str | None = None
|
||||
|
||||
# Optionally save the input test instance
|
||||
@@ -66,15 +66,19 @@ class EvalOutput(BaseModel):
|
||||
|
||||
def model_dump(self, *args, **kwargs):
|
||||
dumped_dict = super().model_dump(*args, **kwargs)
|
||||
# Remove None values
|
||||
dumped_dict = {k: v for k, v in dumped_dict.items() if v is not None}
|
||||
# Apply custom serialization for metadata (to avoid leaking sensitive information)
|
||||
dumped_dict['metadata'] = self.metadata.model_dump()
|
||||
if self.metadata is not None:
|
||||
dumped_dict['metadata'] = self.metadata.model_dump()
|
||||
return dumped_dict
|
||||
|
||||
def model_dump_json(self, *args, **kwargs):
|
||||
dumped = super().model_dump_json(*args, **kwargs)
|
||||
dumped_dict = json.loads(dumped)
|
||||
# Apply custom serialization for metadata (to avoid leaking sensitive information)
|
||||
dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
|
||||
if 'metadata' in dumped_dict:
|
||||
dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
|
||||
return json.dumps(dumped_dict)
|
||||
|
||||
|
||||
@@ -260,32 +264,46 @@ def _process_instance_wrapper(
|
||||
result = process_instance_func(instance, metadata, use_mp)
|
||||
return result
|
||||
except Exception as e:
|
||||
error = str(e)
|
||||
stacktrace = traceback.format_exc()
|
||||
if attempt == max_retries:
|
||||
logger.exception(e)
|
||||
msg = (
|
||||
'-' * 10
|
||||
+ '\n'
|
||||
+ f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
|
||||
+ '\n'
|
||||
+ f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
|
||||
+ '-' * 10
|
||||
)
|
||||
# Raise an error after all retries & stop the evaluation
|
||||
raise RuntimeError(
|
||||
f'Maximum error retries reached for instance {instance.instance_id}'
|
||||
) from e
|
||||
error = str(e)
|
||||
stacktrace = traceback.format_exc()
|
||||
msg = (
|
||||
'-' * 10
|
||||
+ '\n'
|
||||
+ f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
|
||||
+ '\n'
|
||||
+ '-' * 10
|
||||
+ '[This error occurred after maximum retries]'
|
||||
+ f'[The above error occurred. Retrying... (attempt {attempt + 1} of {max_retries})]'
|
||||
+ '-' * 10
|
||||
+ '\n'
|
||||
)
|
||||
logger.error(msg)
|
||||
if use_mp:
|
||||
print(msg) # use print to directly print to console
|
||||
time.sleep(1) # Add a small delay before retrying
|
||||
time.sleep(5)
|
||||
|
||||
|
||||
def _process_instance_wrapper_mp(args):
|
||||
"""Wrapper for multiprocessing, especially for imap_unordered."""
|
||||
return _process_instance_wrapper(*args)
|
||||
|
||||
|
||||
def run_evaluation(
|
||||
dataset: pd.DataFrame,
|
||||
metadata: EvalMetadata,
|
||||
metadata: EvalMetadata | None,
|
||||
output_file: str,
|
||||
num_workers: int,
|
||||
process_instance_func: Callable[
|
||||
@@ -294,10 +312,14 @@ def run_evaluation(
|
||||
max_retries: int = 5, # number of retries for each instance
|
||||
):
|
||||
use_multiprocessing = num_workers > 1
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {metadata.agent_class}:\n'
|
||||
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
|
||||
)
|
||||
|
||||
if metadata is not None:
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {metadata.agent_class}:\n'
|
||||
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
|
||||
)
|
||||
else:
|
||||
logger.info(f'Evaluation started with {num_workers} workers.')
|
||||
|
||||
total_instances = len(dataset)
|
||||
pbar = tqdm(total=total_instances, desc='Instances processed')
|
||||
@@ -305,20 +327,13 @@ def run_evaluation(
|
||||
|
||||
try:
|
||||
if use_multiprocessing:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = [
|
||||
executor.submit(
|
||||
_process_instance_wrapper,
|
||||
process_instance_func=process_instance_func,
|
||||
instance=instance,
|
||||
metadata=metadata,
|
||||
use_mp=True,
|
||||
max_retries=max_retries,
|
||||
)
|
||||
with mp.Pool(num_workers) as pool:
|
||||
args_iter = (
|
||||
(process_instance_func, instance, metadata, True, max_retries)
|
||||
for _, instance in dataset.iterrows()
|
||||
]
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
)
|
||||
results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
|
||||
for result in results:
|
||||
update_progress(result, pbar, output_fp)
|
||||
else:
|
||||
for _, instance in dataset.iterrows():
|
||||
@@ -355,18 +370,27 @@ def reset_logger_for_multiprocessing(
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
# add back the console handler to print ONE line
|
||||
logger.addHandler(get_console_handler())
|
||||
|
||||
# add console handler to print ONE line
|
||||
console_handler = get_console_handler(log_level=logging.INFO)
|
||||
console_handler.setFormatter(
|
||||
logging.Formatter(
|
||||
f'Instance {instance_id} - ' + '%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
)
|
||||
logger.addHandler(console_handler)
|
||||
logger.info(
|
||||
f'Starting evaluation for instance {instance_id}.\n'
|
||||
f'Hint: run "tail -f {log_file}" to see live logs in a separate shell'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
# Only log WARNING or higher to console
|
||||
console_handler.setLevel(logging.WARNING)
|
||||
|
||||
# Log INFO and above to file
|
||||
os.makedirs(os.path.dirname(log_file), exist_ok=True)
|
||||
file_handler = logging.FileHandler(log_file)
|
||||
file_handler.setFormatter(
|
||||
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||
)
|
||||
file_handler.setLevel(logging.INFO)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
@@ -183,7 +183,8 @@ class SandboxConfig:
|
||||
"""Configuration for the sandbox.
|
||||
|
||||
Attributes:
|
||||
api_hostname: The hostname for the EventStream Runtime API.
|
||||
remote_runtime_api_url: The hostname for the Remote Runtime API.
|
||||
local_runtime_url: The default hostname for the local runtime. You may want to change to http://host.docker.internal for DIND environments
|
||||
base_container_image: The base container image from which to build the runtime image.
|
||||
runtime_container_image: The runtime container image to use.
|
||||
user_id: The user ID for the sandbox.
|
||||
@@ -204,7 +205,8 @@ class SandboxConfig:
|
||||
Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
|
||||
"""
|
||||
|
||||
api_hostname: str = 'localhost'
|
||||
remote_runtime_api_url: str = 'http://localhost:8000'
|
||||
local_runtime_url: str = 'http://localhost'
|
||||
api_key: str | None = None
|
||||
base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22' # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
|
||||
runtime_container_image: str | None = None
|
||||
@@ -755,6 +757,18 @@ def get_parser() -> argparse.ArgumentParser:
|
||||
type=str,
|
||||
help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
|
||||
)
|
||||
# Map-reduce arguments for evaluation
|
||||
parser.add_argument(
|
||||
'--eval-map-reduce-write-inputs',
|
||||
action='store_true',
|
||||
help='write inputs to output_dir/mr_inputs',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--eval-map-reduce-read-input-file',
|
||||
type=str,
|
||||
default=None,
|
||||
help='read input (arguments for process_instance) from this file, run it, and write output to output_dir/mr_outputs',
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from litellm import completion as litellm_completion
|
||||
from litellm import completion_cost as litellm_completion_cost
|
||||
from litellm.exceptions import (
|
||||
APIConnectionError,
|
||||
APIError,
|
||||
ContentPolicyViolationError,
|
||||
InternalServerError,
|
||||
NotFoundError,
|
||||
@@ -37,6 +38,14 @@ from openhands.core.metrics import Metrics
|
||||
__all__ = ['LLM']
|
||||
|
||||
message_separator = '\n\n----------\n\n'
|
||||
# tuple of exceptions to retry on
|
||||
LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
|
||||
APIConnectionError,
|
||||
APIError,
|
||||
InternalServerError,
|
||||
RateLimitError,
|
||||
ServiceUnavailableError,
|
||||
)
|
||||
|
||||
cache_prompting_supported_models = [
|
||||
'claude-3-5-sonnet-20240620',
|
||||
|
||||
@@ -36,14 +36,12 @@ class DockerRuntimeBuilder(RuntimeBuilder):
|
||||
|
||||
logger.info(f'Image [{target_image_hash_name}] build finished.')
|
||||
|
||||
assert (
|
||||
target_image_tag
|
||||
), f'Expected target image tag [{target_image_tag}] is None'
|
||||
image = self.docker_client.images.get(target_image_hash_name)
|
||||
image.tag(target_image_repo, target_image_tag)
|
||||
logger.info(
|
||||
f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
|
||||
)
|
||||
if target_image_tag:
|
||||
image = self.docker_client.images.get(target_image_hash_name)
|
||||
image.tag(target_image_repo, target_image_tag)
|
||||
logger.info(
|
||||
f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
|
||||
)
|
||||
|
||||
# Check if the image is built successfully
|
||||
image = self.docker_client.images.get(target_image_hash_name)
|
||||
|
||||
@@ -124,9 +124,7 @@ class EventStreamRuntime(Runtime):
|
||||
self.config = config
|
||||
self._host_port = 30000 # initial dummy value
|
||||
self._container_port = 30001 # initial dummy value
|
||||
self.api_url = (
|
||||
f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
|
||||
)
|
||||
self.api_url = f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
|
||||
self.session = requests.Session()
|
||||
self.instance_id = (
|
||||
sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
|
||||
@@ -212,7 +210,7 @@ class EventStreamRuntime(Runtime):
|
||||
self._host_port
|
||||
) # in future this might differ from host port
|
||||
self.api_url = (
|
||||
f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
|
||||
f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
|
||||
)
|
||||
|
||||
use_host_network = self.config.sandbox.use_host_network
|
||||
@@ -419,7 +417,8 @@ class EventStreamRuntime(Runtime):
|
||||
response = self.session.post(
|
||||
f'{self.api_url}/execute_action',
|
||||
json={'action': event_to_dict(action)},
|
||||
timeout=action.timeout,
|
||||
# wait a few more seconds to get the timeout error from client side
|
||||
timeout=action.timeout + 5,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
output = response.json()
|
||||
|
||||
@@ -57,13 +57,6 @@ class RemoteRuntime(Runtime):
|
||||
env_vars: dict[str, str] | None = None,
|
||||
):
|
||||
self.config = config
|
||||
if self.config.sandbox.api_hostname == 'localhost':
|
||||
self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime'
|
||||
logger.warning(
|
||||
'Using localhost as the API hostname is not supported in the RemoteRuntime. Please set a proper hostname.\n'
|
||||
'Setting it to default value: api.all-hands.dev/v0/runtime'
|
||||
)
|
||||
self.api_url = f'https://{self.config.sandbox.api_hostname.rstrip("/")}'
|
||||
|
||||
if self.config.sandbox.api_key is None:
|
||||
raise ValueError(
|
||||
@@ -80,7 +73,7 @@ class RemoteRuntime(Runtime):
|
||||
)
|
||||
|
||||
self.runtime_builder = RemoteRuntimeBuilder(
|
||||
self.api_url, self.config.sandbox.api_key
|
||||
self.config.sandbox.remote_runtime_api_url, self.config.sandbox.api_key
|
||||
)
|
||||
self.runtime_id: str | None = None
|
||||
self.runtime_url: str | None = None
|
||||
@@ -95,7 +88,11 @@ class RemoteRuntime(Runtime):
|
||||
self.container_image: str = self.config.sandbox.base_container_image
|
||||
self.container_name = 'oh-remote-runtime-' + self.instance_id
|
||||
logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}')
|
||||
response = send_request(self.session, 'GET', f'{self.api_url}/registry_prefix')
|
||||
response = send_request(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
|
||||
)
|
||||
response_json = response.json()
|
||||
registry_prefix = response_json['registry_prefix']
|
||||
os.environ['OH_RUNTIME_RUNTIME_IMAGE_REPO'] = (
|
||||
@@ -121,7 +118,7 @@ class RemoteRuntime(Runtime):
|
||||
response = send_request(
|
||||
self.session,
|
||||
'GET',
|
||||
f'{self.api_url}/image_exists',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
|
||||
params={'image': self.container_image},
|
||||
)
|
||||
if response.status_code != 200 or not response.json()['exists']:
|
||||
@@ -155,7 +152,10 @@ class RemoteRuntime(Runtime):
|
||||
|
||||
# Start the sandbox using the /start endpoint
|
||||
response = send_request(
|
||||
self.session, 'POST', f'{self.api_url}/start', json=start_request
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/start',
|
||||
json=start_request,
|
||||
)
|
||||
if response.status_code != 201:
|
||||
raise RuntimeError(f'Failed to start sandbox: {response.text}')
|
||||
@@ -197,6 +197,8 @@ class RemoteRuntime(Runtime):
|
||||
# because the runtime might just be starting up
|
||||
# and have not registered the endpoint yet
|
||||
retry_fns=[is_404_error],
|
||||
# leave enough time for the runtime to start up
|
||||
timeout=600,
|
||||
)
|
||||
if response.status_code != 200:
|
||||
msg = f'Runtime is not alive yet (id={self.runtime_id}). Status: {response.status_code}.'
|
||||
@@ -209,7 +211,7 @@ class RemoteRuntime(Runtime):
|
||||
response = send_request(
|
||||
self.session,
|
||||
'POST',
|
||||
f'{self.api_url}/stop',
|
||||
f'{self.config.sandbox.remote_runtime_api_url}/stop',
|
||||
json={'runtime_id': self.runtime_id},
|
||||
)
|
||||
if response.status_code != 200:
|
||||
@@ -248,7 +250,8 @@ class RemoteRuntime(Runtime):
|
||||
'POST',
|
||||
f'{self.runtime_url}/execute_action',
|
||||
json=request_body,
|
||||
timeout=action.timeout,
|
||||
# wait a few more seconds to get the timeout error from client side
|
||||
timeout=action.timeout + 5,
|
||||
retry_exceptions=list(
|
||||
filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS)
|
||||
),
|
||||
|
||||
@@ -370,10 +370,16 @@ def _build_sandbox_image(
|
||||
target_image_hash_name = f'{target_image_repo}:{target_image_hash_tag}'
|
||||
target_image_generic_name = f'{target_image_repo}:{target_image_tag}'
|
||||
|
||||
tags_to_add = [target_image_hash_name]
|
||||
|
||||
# Only add the generic tag if the image does not exist
|
||||
# so it does not get overwritten & only points to the earliest version
|
||||
# to avoid "too many layers" after many re-builds
|
||||
if not runtime_builder.image_exists(target_image_generic_name):
|
||||
tags_to_add.append(target_image_generic_name)
|
||||
|
||||
try:
|
||||
image_name = runtime_builder.build(
|
||||
path=docker_folder, tags=[target_image_hash_name, target_image_generic_name]
|
||||
)
|
||||
image_name = runtime_builder.build(path=docker_folder, tags=tags_to_add)
|
||||
if not image_name:
|
||||
raise RuntimeError(f'Build failed for image {target_image_hash_name}')
|
||||
except Exception as e:
|
||||
|
||||
@@ -66,8 +66,7 @@ RUN \
|
||||
/openhands/miniforge3/bin/mamba run -n base poetry run pip install playwright && \
|
||||
/openhands/miniforge3/bin/mamba run -n base poetry run playwright install --with-deps chromium && \
|
||||
# Set environment variables
|
||||
export OH_INTERPRETER_PATH=$(/openhands/miniforge3/bin/mamba run -n base poetry run python -c "import sys; print(sys.executable)") && \
|
||||
export OH_VENV_PATH=$(/openhands/miniforge3/bin/mamba run -n base poetry env info --path) && \
|
||||
echo "OH_INTERPRETER_PATH=$(/openhands/miniforge3/bin/mamba run -n base poetry run python -c "import sys; print(sys.executable)")" >> /etc/environment && \
|
||||
# Install extra dependencies if specified
|
||||
{{ extra_deps }} {% if extra_deps %} && {% endif %} \
|
||||
# Clear caches
|
||||
@@ -78,16 +77,6 @@ RUN \
|
||||
# Clean up
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
/openhands/miniforge3/bin/mamba clean --all
|
||||
{% if not skip_init %}
|
||||
RUN \
|
||||
# Add the Poetry virtual environment to the bashrc
|
||||
echo "export OH_INTERPRETER_PATH=\"$OH_INTERPRETER_PATH\"" >> /etc/bash.bashrc && \
|
||||
echo "export OH_VENV_PATH=\"$OH_VENV_PATH\"" >> /etc/bash.bashrc && \
|
||||
# Activate the Poetry virtual environment
|
||||
echo 'source "$OH_VENV_PATH/bin/activate"' >> /etc/bash.bashrc && \
|
||||
# Use the Poetry virtual environment's Python interpreter
|
||||
echo 'alias python="$OH_INTERPRETER_PATH"' >> /etc/bash.bashrc
|
||||
{% endif %}
|
||||
# ================================================================
|
||||
# END: Copy Project and Install/Update Dependencies
|
||||
# ================================================================
|
||||
|
||||
8
poetry.lock
generated
8
poetry.lock
generated
@@ -3761,13 +3761,13 @@ types-tqdm = "*"
|
||||
|
||||
[[package]]
|
||||
name = "litellm"
|
||||
version = "1.46.1"
|
||||
version = "1.48.9"
|
||||
description = "Library to easily interface with LLM API providers"
|
||||
optional = false
|
||||
python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
|
||||
files = [
|
||||
{file = "litellm-1.46.1-py3-none-any.whl", hash = "sha256:f6b78278cf21a38da0d10a8b3e7b1084b6410012552c0a413774d1c43706e5ba"},
|
||||
{file = "litellm-1.46.1.tar.gz", hash = "sha256:993c23d6f5e1d0f070b250d858a6ee87750a032e38f460f8c82385be854bc45f"},
|
||||
{file = "litellm-1.48.9-py3-none-any.whl", hash = "sha256:9608f510e82c27b15bab7bcfab5e1308055f0c457e7881ccfff91c189bf2c055"},
|
||||
{file = "litellm-1.48.9.tar.gz", hash = "sha256:02dd2f66fab24f388692694401bbabd34de5a62a16d064b3f15726a550a65cd3"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -9675,4 +9675,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.11"
|
||||
content-hash = "5acb0e1ac5538c10add8f72b0f5c2762bea1a08cce7548deccd263934f043cfb"
|
||||
content-hash = "96a302abea5291a44d97c2e4c813a8db2e6f3b1327b1c4f7dbf6d00eb8e19560"
|
||||
|
||||
@@ -16,7 +16,7 @@ packages = [
|
||||
python = "^3.11"
|
||||
datasets = "*"
|
||||
pandas = "*"
|
||||
litellm = "*"
|
||||
litellm = "^1.48.6"
|
||||
google-generativeai = "*" # To use litellm with Gemini Pro API
|
||||
termcolor = "*"
|
||||
seaborn = "*"
|
||||
|
||||
@@ -57,6 +57,31 @@ def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands):
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_bash_timeout_and_keyboard_interrupt(temp_dir, box_class, run_as_openhands):
|
||||
runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
|
||||
try:
|
||||
action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert (
|
||||
'[Command timed out after 1 seconds. SIGINT was sent to interrupt it.]'
|
||||
in obs.content
|
||||
)
|
||||
assert 'KeyboardInterrupt' in obs.content
|
||||
|
||||
# follow up command should not be affected
|
||||
action = CmdRunAction(command='ls')
|
||||
action.timeout = 1
|
||||
obs = runtime.run_action(action)
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
finally:
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_multiline_commands(temp_dir, box_class):
|
||||
runtime = _load_runtime(temp_dir, box_class)
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user