Retry on litellm's APIError, which includes 502 (#4167)

upgrade litellm
fix wrong import
2026-04-29 03:00:45 -04:00 · 2024-10-03 03:00:58 +00:00 · 2024-10-02 19:49:22 +00:00 · 2024-10-02 19:49:12 +00:00 · 2024-10-02 04:08:02 +00:00 · 2024-10-02 01:01:16 +00:00
22 changed files with 703 additions and 123 deletions
--- a/.github/workflows/ghcr_runtime.yml
+++ b/.github/workflows/ghcr_runtime.yml
@@ -26,6 +26,67 @@ on:
        default: ''

 jobs:
+  # Builds the OpenHands Docker images
+  ghcr_build_app:
+    name: Build App Image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      hash_from_app_image: ${{ steps.get_hash_in_app_image.outputs.hash_from_app_image }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build and push app image
+        if: "!github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh openhands ${{ github.repository_owner }} --push
+      - name: Build app image
+        if: "github.event.pull_request.head.repo.fork"
+        run: |
+          ./containers/build.sh openhands ${{ github.repository_owner }}  # fork PRs: build only, no --push
+      - name: Get hash in App Image
+        id: get_hash_in_app_image
+        run: |
+          # Lowercase the repository owner
+          export REPO_OWNER=${{ github.repository_owner }}
+          REPO_OWNER=$(echo $REPO_OWNER | tr '[:upper:]' '[:lower:]')
+          # Run the build script in the app image
+          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock ghcr.io/${REPO_OWNER}/openhands:${{ github.sha }} /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
+          # Get the hash from the build script
+          hash_from_app_image=$(cat docker-outputs.txt | grep "Hash for docker build directory" | awk -F "): " '{print $2}' | uniq | head -n1)
+          echo "hash_from_app_image=$hash_from_app_image" >> $GITHUB_OUTPUT
+          echo "Hash from app image: $hash_from_app_image"
+
+
  # Builds the runtime Docker images
  ghcr_build_runtime:
    name: Build Image
@@ -56,7 +117,9 @@ jobs:
          docker-images: false
          swap-storage: true
      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@v3.0.0
+        with:
+          image: tonistiigi/binfmt:latest
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
--- a/agenthub/codeact_agent/action_parser.py
+++ b/agenthub/codeact_agent/action_parser.py
@@ -40,6 +40,10 @@ class CodeActResponseParser(ResponseParser):
        if action is None:
            return ''
        for lang in ['bash', 'ipython', 'browse']:
+            # special handling for DeepSeek: it has stop-word bug and returns </execute_ipython instead of </execute_ipython>
+            if f'</execute_{lang}' in action and f'</execute_{lang}>' not in action:
+                action = action.replace(f'</execute_{lang}', f'</execute_{lang}>')
+
            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
                action += f'</execute_{lang}>'
        return action
--- a/containers/app/Dockerfile
+++ b/containers/app/Dockerfile
@@ -37,7 +37,7 @@ ARG OPENHANDS_BUILD_VERSION #re-declare for this section
 ENV RUN_AS_OPENHANDS=true
 # A random number--we need this to be different from the user's UID on the host machine
 ENV OPENHANDS_USER_ID=42420
-ENV SANDBOX_API_HOSTNAME=host.docker.internal
+ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
 ENV USE_HOST_NETWORK=false
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -63,13 +63,13 @@ then your command would be:
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ```

-### Run Inference on `RemoteRuntime`
+### Run Inference on `RemoteRuntime` (experimental)

 This is in limited beta. Contact Xingyao over slack if you want to try this out!

 ```bash
 # ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
 # This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
 ```
@@ -157,6 +157,24 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
 - `logs/`: a directory of test logs

+### Run evaluation with `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over slack if you want to try this out!
+
+```bash
+# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] [dataset] [dataset_split]
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
+evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+# This example evaluates patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 workers running in parallel
+```
+
+To clean up all existing runtimes that you've already started, run:
+
+```bash
+ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+```
+
+
 ## Visualize Results

 First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -0,0 +1,377 @@
+import os
+import tempfile
+import time
+
+import pandas as pd
+from swebench.harness.grading import get_eval_report
+from swebench.harness.run_evaluation import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+)
+from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
+from swebench.harness.utils import load_swebench_dataset
+
+from evaluation.swe_bench.run_infer import get_instance_docker_image
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime
+from openhands.events.action import CmdRunAction
+from openhands.events.observation import CmdOutputObservation
+
+# TODO: migrate all swe-bench docker to ghcr.io/openhands
+DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
+logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
+
+
+def process_git_patch(patch):
+    if not isinstance(patch, str):
+        return ''
+
+    if not patch.strip():
+        # skip empty patches
+        return ''
+
+    patch = patch.replace('\r\n', '\n')
+    # There might be some weird characters at the beginning of the patch
+    # due to some OpenHands inference command outputs
+
+    # FOR EXAMPLE:
+    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
+    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
+    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
+    # new file mode 100644
+    # index 0000000000..fc13db5948
+
+    # We "find" the first line that starts with "diff" and then we remove lines before it
+    lines = patch.split('\n')
+    for i, line in enumerate(lines):
+        if line.startswith('diff --git'):
+            patch = '\n'.join(lines[i:])
+            break
+
+    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
+    return patch
+
+
+def get_config(instance: pd.Series) -> AppConfig:
+    # We use a different instance image for the each instance of swe-bench eval
+    base_container_image = get_instance_docker_image(instance['instance_id'])
+    logger.info(
+        f'Using instance container image: {base_container_image}. '
+        f'Please make sure this image exists. '
+        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+    )
+    config = AppConfig(
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
+        sandbox=SandboxConfig(
+            base_container_image=base_container_image,
+            use_host_network=False,
+            # large enough timeout, since some testcases take very long to run
+            timeout=1800,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    return config
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata | None = None,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        global output_file
+        log_dir = output_file.replace('.jsonl', '.logs')
+        os.makedirs(log_dir, exist_ok=True)
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+    config = get_config(instance)
+    instance_id = instance.instance_id
+    model_patch = instance['model_patch']
+    test_spec: TestSpec = instance['test_spec']
+    logger.info(f'Starting evaluation for instance {instance_id}.')
+
+    if 'test_result' not in instance.keys():
+        instance['test_result'] = {}
+    instance['test_result']['report'] = {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+        'test_timeout': False,
+    }
+
+    if model_patch == '':
+        instance['test_result']['report']['empty_generation'] = True
+        return EvalOutput(
+            instance_id=instance_id,
+            test_result=instance['test_result'],
+        )
+
+    runtime = create_runtime(config, sid=instance_id)
+
+    # Get patch and save it to /tmp/patch.diff
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Patch file
+        patch_file_path = os.path.join(temp_dir, 'patch.diff')
+        with open(patch_file_path, 'w') as f:
+            f.write(model_patch)
+        runtime.copy_to(patch_file_path, '/tmp')
+        # Eval script
+        eval_script_path = os.path.join(temp_dir, 'eval.sh')
+        with open(eval_script_path, 'w') as f:
+            f.write(test_spec.eval_script)
+        runtime.copy_to(eval_script_path, '/tmp')
+
+    # Set +x
+    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.exit_code == 0
+
+    # Apply patch
+    exec_command = (
+        'cd /testbed && '
+        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
+        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+        "echo 'APPLY_PATCH_FAIL')))"
+    )
+    action = CmdRunAction(command=exec_command, keep_prompt=False)
+    action.timeout = 600
+    obs = runtime.run_action(action)
+    assert isinstance(obs, CmdOutputObservation)
+    apply_patch_output = obs.content
+    assert isinstance(apply_patch_output, str)
+    instance['test_result']['apply_patch_output'] = apply_patch_output
+
+    try:
+        if 'APPLY_PATCH_FAIL' in apply_patch_output:
+            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
+            instance['test_result']['report']['failed_apply_patch'] = True
+
+            return EvalOutput(
+                instance_id=instance_id,
+                test_result=instance['test_result'],
+            )
+        elif 'APPLY_PATCH_PASS' in apply_patch_output:
+            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')
+
+            # Run eval script in background and save output to log file
+            log_file = '/tmp/eval_output.log'
+            action = CmdRunAction(
+                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
+            )
+            action.timeout = 60  # Short timeout just to get the process ID
+            obs = runtime.run_action(action)
+
+            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
+                pid = obs.content.split()[-1].strip()
+                logger.info(
+                    f'[{instance_id}] Evaluation process started with PID: {pid}'
+                )
+
+                # Poll for completion
+                start_time = time.time()
+                timeout = 1800  # 30 minutes
+                while True:
+                    seconds_elapsed = time.time() - start_time
+                    if seconds_elapsed > timeout:
+                        logger.info(
+                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
+                        )
+                        instance['test_result']['report']['test_timeout'] = True
+                        break
+                    check_action = CmdRunAction(
+                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
+                    )
+                    check_action.timeout = 60
+                    check_obs = runtime.run_action(check_action)
+                    if (
+                        isinstance(check_obs, CmdOutputObservation)
+                        and check_obs.content.split()[-1].strip() == '1'
+                    ):
+                        logger.info(
+                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
+                        )
+                        break
+                    logger.info(
+                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
+                    )
+                    time.sleep(30)  # Wait for 30 seconds before checking again
+
+                # Read the log file
+                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
+                cat_action.timeout = 300
+                cat_obs = runtime.run_action(cat_action)
+
+                # Grade answer
+                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
+                    test_output = cat_obs.content
+                    assert isinstance(test_output, str)
+                    instance['test_result']['test_output'] = test_output
+
+                    # Get report from test output
+                    logger.info(f'[{instance_id}] Grading answer...')
+                    with tempfile.TemporaryDirectory() as temp_dir:
+                        # Create a directory structure that matches the expected format
+                        # NOTE: this is a hack to make the eval report format consistent
+                        # with the original SWE-Bench eval script
+                        log_dir = os.path.join(temp_dir, 'logs', instance_id)
+                        os.makedirs(log_dir, exist_ok=True)
+                        test_output_path = os.path.join(log_dir, 'test_output.txt')
+                        with open(test_output_path, 'w') as f:
+                            f.write(test_output)
+
+                        _report = get_eval_report(
+                            test_spec=test_spec,
+                            prediction={
+                                'model_patch': model_patch,
+                                'instance_id': instance_id,
+                            },
+                            log_path=test_output_path,
+                            include_tests_status=True,
+                        )
+                        report = _report[instance_id]
+                        logger.info(
+                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
+                        )
+                        instance['test_result']['report']['resolved'] = report[
+                            'resolved'
+                        ]
+            else:
+                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
+                instance['test_result']['report']['error_eval'] = True
+
+            return EvalOutput(
+                instance_id=instance_id,
+                test_result=instance['test_result'],
+            )
+        else:
+            logger.info(
+                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
+            )
+            raise RuntimeError(
+                instance_id,
+                f'Unexpected output when applying patch:\n{apply_patch_output}',
+                logger,
+            )
+    finally:
+        runtime.close()
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--input-file',
+        type=str,
+        help='Path to input predictions file',
+        required=True,
+    )
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='princeton-nlp/SWE-bench',
+        help='data set to evaluate on, either full-test or lite-test',
+    )
+    parser.add_argument(
+        '--split',
+        type=str,
+        default='test',
+        help='split to evaluate on',
+    )
+    args, _ = parser.parse_known_args()
+
+    # Load SWE-Bench dataset
+    full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
+        args.dataset, args.split
+    )
+    instance_id_to_instance = {
+        instance['instance_id']: instance for instance in full_dataset
+    }
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
+    )
+
+    # Load predictions
+    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
+    predictions = pd.read_json(args.input_file, lines=True)
+    assert (
+        'instance_id' in predictions.columns
+    ), 'Input file must contain instance_id column.'
+
+    if 'model_patch' not in predictions.columns and not (
+        'test_result' in predictions.columns
+        and 'git_patch' in predictions['test_result'].iloc[0]
+    ):  # neither a model_patch column nor the test_result['git_patch'] fallback read below
+        raise ValueError(
+            'Input file must contain model_patch column OR test_result column with git_patch field.'
+        )
+    assert len(predictions['instance_id'].unique()) == len(
+        predictions
+    ), 'instance_id column must be unique.'
+
+    if 'model_patch' not in predictions.columns:
+        predictions['model_patch'] = predictions['test_result'].apply(
+            lambda x: x['git_patch']
+        )
+    assert {'instance_id', 'model_patch'}.issubset(
+        set(predictions.columns)
+    ), 'Input file must contain instance_id and model_patch columns.'
+
+    # Process model_patch
+    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
+
+    # Merge predictions with dataset
+    predictions['instance'] = predictions['instance_id'].apply(
+        lambda x: instance_id_to_instance[x]
+    )
+    predictions['test_spec'] = predictions['instance'].apply(make_test_spec)
+
+    # Prepare dataset
+    output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
+    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)
+
+    run_evaluation(
+        instances,
+        metadata=None,
+        output_file=output_file,
+        num_workers=args.eval_num_workers,
+        process_instance_func=process_instance,
+    )
+
+    # Load evaluated predictions & print number of resolved predictions
+    evaluated_predictions = pd.read_json(output_file, lines=True)
+    fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']
+
+    def count_report_field(row, field):
+        return row['test_result']['report'][field]
+
+    for field in fields:
+        count = evaluated_predictions.apply(
+            count_report_field, args=(field,), axis=1
+        ).sum()
+        logger.info(
+            f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
+        )
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -130,6 +130,7 @@ def get_config(
            # large enough timeout, since some testcases take very long to run
            timeout=300,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        ),
        # do not mount workspace
        workspace_base=None,
--- a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+++ b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
@@ -2,20 +2,26 @@


 # API base URL
-BASE_URL="https://api.all-hands.dev/v0"
+BASE_URL="https://runtime.eval.all-hands.dev"

 # Get the list of runtimes
-runtimes=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
-  --header "X-API-Key: ${ALLHANDS_API_KEY}" | jq -r '.runtimes | .[].runtime_id')
+response=$(curl --silent --location --request GET "${BASE_URL}/list" \
+  --header "X-API-Key: ${ALLHANDS_API_KEY}")

+n_runtimes=$(echo "$response" | jq -r '.total')
+echo "Found ${n_runtimes} runtimes. Stopping them..."
+
+runtime_ids=$(echo "$response" | jq -r '.runtimes | .[].runtime_id')
 # Loop through each runtime and stop it
-for runtime_id in $runtimes; do
-  echo "Stopping runtime: ${runtime_id}"
-  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
+counter=1
+for runtime_id in $runtime_ids; do
+  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
+  curl --silent --location --request POST "${BASE_URL}/stop" \
    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
    --header "Content-Type: application/json" \
    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
  echo
+  ((counter++))
 done

 echo "All runtimes have been stopped."
--- a/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
+++ b/evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
@@ -3,6 +3,8 @@ import os

 import pandas as pd

+from evaluation.swe_bench.eval_infer import process_git_patch
+
 parser = argparse.ArgumentParser()
 parser.add_argument('oh_output_file', type=str)
 args = parser.parse_args()
@@ -14,36 +16,6 @@ oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
 model_name = os.path.basename(os.path.dirname(args.oh_output_file))


-def process_git_patch(patch):
-    if not isinstance(patch, str):
-        return ''
-
-    if not patch.strip():
-        # skip empty patches
-        return ''
-
-    patch = patch.replace('\r\n', '\n')
-    # There might be some weird characters at the beginning of the patch
-    # due to some OpenHands inference command outputs
-
-    # FOR EXAMPLE:
-    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
-    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
-    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
-    # new file mode 100644
-    # index 0000000000..fc13db5948
-
-    # We "find" the first line that starts with "diff" and then we remove lines before it
-    lines = patch.split('\n')
-    for i, line in enumerate(lines):
-        if line.startswith('diff --git'):
-            patch = '\n'.join(lines[i:])
-            break
-
-    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
-    return patch
-
-
 def convert_row_to_swebench_format(row):
    if 'git_patch' in row:
        model_patch = row['git_patch']
--- a/evaluation/swe_bench/scripts/eval/download_gold_patch.py
+++ b/evaluation/swe_bench/scripts/eval/download_gold_patch.py
@@ -0,0 +1,27 @@
+import argparse
+
+import pandas as pd
+from datasets import load_dataset
+
+parser = argparse.ArgumentParser()
+parser.add_argument('output_filepath', type=str, help='Path to save the output file')
+parser.add_argument(
+    '--dataset_name',
+    type=str,
+    help='Name of the dataset to download',
+    default='princeton-nlp/SWE-bench_Lite',
+)
+parser.add_argument('--split', type=str, help='Split to download', default='test')
+args = parser.parse_args()
+
+dataset = load_dataset(args.dataset_name, split=args.split)
+output_filepath = args.output_filepath
+print(
+    f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
+)
+patches = [
+    {'instance_id': row['instance_id'], 'model_patch': row['patch']} for row in dataset
+]
+print(f'{len(patches)} gold patches loaded')
+pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')
+print(f'Patches saved to {output_filepath}')
--- a/evaluation/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/swe_bench/scripts/eval_infer.sh
@@ -98,6 +98,8 @@ if [ -z "$INSTANCE_ID" ]; then

    RESULT_OUTPUT_DIR=$(dirname $SWEBENCH_FORMAT_JSONL)
    echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"
+    RESULT_OUTPUT_DIR_NAME=$(basename $RESULT_OUTPUT_DIR)
+    echo "RESULT_OUTPUT_DIR_NAME: $RESULT_OUTPUT_DIR_NAME"

    # move the eval results to the target directory
    mkdir -p $RESULT_OUTPUT_DIR
@@ -106,7 +108,7 @@ if [ -z "$INSTANCE_ID" ]; then
        rm -rf $RESULT_OUTPUT_DIR/eval_outputs
    fi

-    mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
+    mv logs/run_evaluation/$RUN_ID/$RESULT_OUTPUT_DIR_NAME $RESULT_OUTPUT_DIR
    mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
    echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt

--- a/evaluation/swe_bench/scripts/eval_infer_remote.sh
+++ b/evaluation/swe_bench/scripts/eval_infer_remote.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+set -eo pipefail
+
+INPUT_FILE=$1
+NUM_WORKERS=$2
+DATASET=$3
+SPLIT=$4
+
+if [ -z "$INPUT_FILE" ]; then
+  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
+  exit 1
+fi
+
+if [ -z "$DATASET" ]; then
+  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
+  DATASET="princeton-nlp/SWE-bench_Lite"
+fi
+
+if [ -z "$SPLIT" ]; then
+  echo "SPLIT not specified, use default test"
+  SPLIT="test"
+fi
+
+if [ -z "$NUM_WORKERS" ]; then
+  echo "NUM_WORKERS not specified, use default 1"
+  NUM_WORKERS=1
+fi
+
+echo "... Evaluating on $INPUT_FILE ..."
+
+COMMAND=(poetry run python evaluation/swe_bench/eval_infer.py
+  --eval-num-workers "$NUM_WORKERS"
+  --input-file "$INPUT_FILE"
+  --dataset "$DATASET"
+  --split "$SPLIT")
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND+=(--eval-n-limit "$EVAL_LIMIT")
+fi
+
+# Run the command as an argument array (no eval), so paths with spaces stay intact
+"${COMMAND[@]}"
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -6,7 +6,6 @@ import pathlib
 import subprocess
 import time
 import traceback
-from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Any, Awaitable, Callable, TextIO

 import pandas as pd
@@ -50,15 +49,16 @@ class EvalMetadata(BaseModel):
 class EvalOutput(BaseModel):
    # NOTE: User-specified
    instance_id: str
-    instruction: str
    # output of the evaluation
    # store anything that is needed for the score calculation
    test_result: dict[str, Any]

+    instruction: str | None = None
+
    # Interaction info
-    metadata: EvalMetadata
-    history: list[tuple[dict[str, Any], dict[str, Any]]]
-    metrics: dict[str, Any]
+    metadata: EvalMetadata | None = None
+    history: list[tuple[dict[str, Any], dict[str, Any]]] | None = None
+    metrics: dict[str, Any] | None = None
    error: str | None = None

    # Optionally save the input test instance
@@ -66,15 +66,19 @@ class EvalOutput(BaseModel):

    def model_dump(self, *args, **kwargs):
        dumped_dict = super().model_dump(*args, **kwargs)
+        # Remove None values
+        dumped_dict = {k: v for k, v in dumped_dict.items() if v is not None}
        # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        dumped_dict['metadata'] = self.metadata.model_dump()
+        if self.metadata is not None:
+            dumped_dict['metadata'] = self.metadata.model_dump()
        return dumped_dict

    def model_dump_json(self, *args, **kwargs):
        dumped = super().model_dump_json(*args, **kwargs)
        dumped_dict = json.loads(dumped)
        # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
+        if self.metadata is not None:  # key check is not enough: the JSON dump can hold "metadata": null
+            dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
        return json.dumps(dumped_dict)


@@ -260,32 +264,46 @@ def _process_instance_wrapper(
            result = process_instance_func(instance, metadata, use_mp)
            return result
        except Exception as e:
+            error = str(e)
+            stacktrace = traceback.format_exc()
            if attempt == max_retries:
+                logger.exception(e)
+                msg = (
+                    '-' * 10
+                    + '\n'
+                    + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
+                    + '\n'
+                    + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
+                    + '-' * 10
+                )
                # Raise an error after all retries & stop the evaluation
                raise RuntimeError(
                    f'Maximum error retries reached for instance {instance.instance_id}'
                ) from e
-            error = str(e)
-            stacktrace = traceback.format_exc()
            msg = (
                '-' * 10
                + '\n'
                + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
                + '\n'
                + '-' * 10
-                + '[This error occurred after maximum retries]'
+                + f'[The above error occurred. Retrying... (attempt {attempt + 1} of {max_retries})]'
                + '-' * 10
                + '\n'
            )
            logger.error(msg)
            if use_mp:
                print(msg)  # use print to directly print to console
-            time.sleep(1)  # Add a small delay before retrying
+            time.sleep(5)
+
+
+def _process_instance_wrapper_mp(args):
+    """Wrapper for multiprocessing, especially for imap_unordered."""
+    return _process_instance_wrapper(*args)


 def run_evaluation(
    dataset: pd.DataFrame,
-    metadata: EvalMetadata,
+    metadata: EvalMetadata | None,
    output_file: str,
    num_workers: int,
    process_instance_func: Callable[
@@ -294,10 +312,14 @@ def run_evaluation(
    max_retries: int = 5,  # number of retries for each instance
 ):
    use_multiprocessing = num_workers > 1
-    logger.info(
-        f'Evaluation started with Agent {metadata.agent_class}:\n'
-        f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
-    )
+
+    if metadata is not None:
+        logger.info(
+            f'Evaluation started with Agent {metadata.agent_class}:\n'
+            f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
+        )
+    else:
+        logger.info(f'Evaluation started with {num_workers} workers.')

    total_instances = len(dataset)
    pbar = tqdm(total=total_instances, desc='Instances processed')
@@ -305,20 +327,13 @@ def run_evaluation(

    try:
        if use_multiprocessing:
-            with ProcessPoolExecutor(num_workers) as executor:
-                futures = [
-                    executor.submit(
-                        _process_instance_wrapper,
-                        process_instance_func=process_instance_func,
-                        instance=instance,
-                        metadata=metadata,
-                        use_mp=True,
-                        max_retries=max_retries,
-                    )
+            with mp.Pool(num_workers) as pool:
+                args_iter = (
+                    (process_instance_func, instance, metadata, True, max_retries)
                    for _, instance in dataset.iterrows()
-                ]
-                for future in as_completed(futures):
-                    result = future.result()
+                )
+                results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
+                for result in results:
                    update_progress(result, pbar, output_fp)
        else:
            for _, instance in dataset.iterrows():
@@ -355,18 +370,27 @@ def reset_logger_for_multiprocessing(
    # Remove all existing handlers from logger
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
-    # add back the console handler to print ONE line
-    logger.addHandler(get_console_handler())
+
+    # add console handler to print ONE line
+    console_handler = get_console_handler(log_level=logging.INFO)
+    console_handler.setFormatter(
+        logging.Formatter(
+            f'Instance {instance_id} - ' + '%(asctime)s - %(levelname)s - %(message)s'
+        )
+    )
+    logger.addHandler(console_handler)
    logger.info(
        f'Starting evaluation for instance {instance_id}.\n'
        f'Hint: run "tail -f {log_file}" to see live logs in a separate shell'
    )
-    # Remove all existing handlers from logger
-    for handler in logger.handlers[:]:
-        logger.removeHandler(handler)
+    # Only log WARNING or higher to console
+    console_handler.setLevel(logging.WARNING)
+
+    # Log INFO and above to file
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
+    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
--- a/openhands/core/config.py
+++ b/openhands/core/config.py
@@ -183,7 +183,8 @@ class SandboxConfig:
    """Configuration for the sandbox.

    Attributes:
-        api_hostname: The hostname for the EventStream Runtime API.
+        remote_runtime_api_url: The base URL (including scheme) of the Remote Runtime API.
+        local_runtime_url: The base URL (scheme and host, without port) of the local runtime. You may want to change it to http://host.docker.internal for DIND environments.
        base_container_image: The base container image from which to build the runtime image.
        runtime_container_image: The runtime container image to use.
        user_id: The user ID for the sandbox.
@@ -204,7 +205,8 @@ class SandboxConfig:
            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
    """

-    api_hostname: str = 'localhost'
+    remote_runtime_api_url: str = 'http://localhost:8000'
+    local_runtime_url: str = 'http://localhost'
    api_key: str | None = None
    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
    runtime_container_image: str | None = None
@@ -755,6 +757,18 @@ def get_parser() -> argparse.ArgumentParser:
        type=str,
        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
    )
+    # Map-reduce arguments for evaluation
+    parser.add_argument(
+        '--eval-map-reduce-write-inputs',
+        action='store_true',
+        help='write inputs to output_dir/mr_inputs',
+    )
+    parser.add_argument(
+        '--eval-map-reduce-read-input-file',
+        type=str,
+        default=None,
+        help='read input (arguments for process_instance) from this file, run it, and write output to output_dir/mr_outputs',
+    )
    return parser


--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -13,6 +13,7 @@ from litellm import completion as litellm_completion
 from litellm import completion_cost as litellm_completion_cost
 from litellm.exceptions import (
    APIConnectionError,
+    APIError,
    ContentPolicyViolationError,
    InternalServerError,
    NotFoundError,
@@ -37,6 +38,14 @@ from openhands.core.metrics import Metrics
 __all__ = ['LLM']

 message_separator = '\n\n----------\n\n'
+# tuple of exceptions to retry on
+LLM_RETRY_EXCEPTIONS: tuple[type[Exception], ...] = (
+    APIConnectionError,
+    APIError,
+    InternalServerError,
+    RateLimitError,
+    ServiceUnavailableError,
+)

 cache_prompting_supported_models = [
    'claude-3-5-sonnet-20240620',
--- a/openhands/runtime/builder/docker.py
+++ b/openhands/runtime/builder/docker.py
@@ -36,14 +36,12 @@ class DockerRuntimeBuilder(RuntimeBuilder):

        logger.info(f'Image [{target_image_hash_name}] build finished.')

-        assert (
-            target_image_tag
-        ), f'Expected target image tag [{target_image_tag}] is None'
-        image = self.docker_client.images.get(target_image_hash_name)
-        image.tag(target_image_repo, target_image_tag)
-        logger.info(
-            f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
-        )
+        if target_image_tag:
+            image = self.docker_client.images.get(target_image_hash_name)
+            image.tag(target_image_repo, target_image_tag)
+            logger.info(
+                f'Re-tagged image [{target_image_hash_name}] with more generic tag [{target_image_tag}]'
+            )

        # Check if the image is built successfully
        image = self.docker_client.images.get(target_image_hash_name)
--- a/openhands/runtime/client/runtime.py
+++ b/openhands/runtime/client/runtime.py
@@ -124,9 +124,7 @@ class EventStreamRuntime(Runtime):
        self.config = config
        self._host_port = 30000  # initial dummy value
        self._container_port = 30001  # initial dummy value
-        self.api_url = (
-            f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
-        )
+        self.api_url = f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
        self.session = requests.Session()
        self.instance_id = (
            sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
@@ -212,7 +210,7 @@ class EventStreamRuntime(Runtime):
                self._host_port
            )  # in future this might differ from host port
            self.api_url = (
-                f'http://{self.config.sandbox.api_hostname}:{self._container_port}'
+                f'{self.config.sandbox.local_runtime_url}:{self._container_port}'
            )

            use_host_network = self.config.sandbox.use_host_network
@@ -419,7 +417,8 @@ class EventStreamRuntime(Runtime):
                response = self.session.post(
                    f'{self.api_url}/execute_action',
                    json={'action': event_to_dict(action)},
-                    timeout=action.timeout,
+                    # wait a few more seconds to get the timeout error from client side
+                    timeout=action.timeout + 5,
                )
                if response.status_code == 200:
                    output = response.json()
--- a/openhands/runtime/remote/runtime.py
+++ b/openhands/runtime/remote/runtime.py
@@ -57,13 +57,6 @@ class RemoteRuntime(Runtime):
        env_vars: dict[str, str] | None = None,
    ):
        self.config = config
-        if self.config.sandbox.api_hostname == 'localhost':
-            self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime'
-            logger.warning(
-                'Using localhost as the API hostname is not supported in the RemoteRuntime. Please set a proper hostname.\n'
-                'Setting it to default value: api.all-hands.dev/v0/runtime'
-            )
-        self.api_url = f'https://{self.config.sandbox.api_hostname.rstrip("/")}'

        if self.config.sandbox.api_key is None:
            raise ValueError(
@@ -80,7 +73,7 @@ class RemoteRuntime(Runtime):
            )

        self.runtime_builder = RemoteRuntimeBuilder(
-            self.api_url, self.config.sandbox.api_key
+            self.config.sandbox.remote_runtime_api_url, self.config.sandbox.api_key
        )
        self.runtime_id: str | None = None
        self.runtime_url: str | None = None
@@ -95,7 +88,11 @@ class RemoteRuntime(Runtime):
        self.container_image: str = self.config.sandbox.base_container_image
        self.container_name = 'oh-remote-runtime-' + self.instance_id
        logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}')
-        response = send_request(self.session, 'GET', f'{self.api_url}/registry_prefix')
+        response = send_request(
+            self.session,
+            'GET',
+            f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
+        )
        response_json = response.json()
        registry_prefix = response_json['registry_prefix']
        os.environ['OH_RUNTIME_RUNTIME_IMAGE_REPO'] = (
@@ -121,7 +118,7 @@ class RemoteRuntime(Runtime):
        response = send_request(
            self.session,
            'GET',
-            f'{self.api_url}/image_exists',
+            f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
            params={'image': self.container_image},
        )
        if response.status_code != 200 or not response.json()['exists']:
@@ -155,7 +152,10 @@ class RemoteRuntime(Runtime):

        # Start the sandbox using the /start endpoint
        response = send_request(
-            self.session, 'POST', f'{self.api_url}/start', json=start_request
+            self.session,
+            'POST',
+            f'{self.config.sandbox.remote_runtime_api_url}/start',
+            json=start_request,
        )
        if response.status_code != 201:
            raise RuntimeError(f'Failed to start sandbox: {response.text}')
@@ -197,6 +197,8 @@ class RemoteRuntime(Runtime):
            # because the runtime might just be starting up
            # and have not registered the endpoint yet
            retry_fns=[is_404_error],
+            # leave enough time for the runtime to start up
+            timeout=600,
        )
        if response.status_code != 200:
            msg = f'Runtime is not alive yet (id={self.runtime_id}). Status: {response.status_code}.'
@@ -209,7 +211,7 @@ class RemoteRuntime(Runtime):
                response = send_request(
                    self.session,
                    'POST',
-                    f'{self.api_url}/stop',
+                    f'{self.config.sandbox.remote_runtime_api_url}/stop',
                    json={'runtime_id': self.runtime_id},
                )
                if response.status_code != 200:
@@ -248,7 +250,8 @@ class RemoteRuntime(Runtime):
                    'POST',
                    f'{self.runtime_url}/execute_action',
                    json=request_body,
-                    timeout=action.timeout,
+                    # wait a few more seconds to get the timeout error from client side
+                    timeout=action.timeout + 5,
                    retry_exceptions=list(
                        filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS)
                    ),
--- a/openhands/runtime/utils/runtime_build.py
+++ b/openhands/runtime/utils/runtime_build.py
@@ -370,10 +370,16 @@ def _build_sandbox_image(
    target_image_hash_name = f'{target_image_repo}:{target_image_hash_tag}'
    target_image_generic_name = f'{target_image_repo}:{target_image_tag}'

+    tags_to_add = [target_image_hash_name]
+
+    # Only add the generic tag if the image does not exist
+    # so it does not get overwritten & only points to the earliest version
+    # to avoid "too many layers" after many re-builds
+    if not runtime_builder.image_exists(target_image_generic_name):
+        tags_to_add.append(target_image_generic_name)
+
    try:
-        image_name = runtime_builder.build(
-            path=docker_folder, tags=[target_image_hash_name, target_image_generic_name]
-        )
+        image_name = runtime_builder.build(path=docker_folder, tags=tags_to_add)
        if not image_name:
            raise RuntimeError(f'Build failed for image {target_image_hash_name}')
    except Exception as e:
--- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
@@ -66,8 +66,7 @@ RUN \
    /openhands/miniforge3/bin/mamba run -n base poetry run pip install playwright && \
    /openhands/miniforge3/bin/mamba run -n base poetry run playwright install --with-deps chromium && \
    # Set environment variables
-    export OH_INTERPRETER_PATH=$(/openhands/miniforge3/bin/mamba run -n base poetry run python -c "import sys; print(sys.executable)") && \
-    export OH_VENV_PATH=$(/openhands/miniforge3/bin/mamba run -n base poetry env info --path) && \
+    echo "OH_INTERPRETER_PATH=$(/openhands/miniforge3/bin/mamba run -n base poetry run python -c "import sys; print(sys.executable)")" >> /etc/environment && \
    # Install extra dependencies if specified
    {{ extra_deps }} {% if extra_deps %} && {% endif %} \
    # Clear caches
@@ -78,16 +77,6 @@ RUN \
    # Clean up
    apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    /openhands/miniforge3/bin/mamba clean --all
-{% if not skip_init %}
-RUN \
-    # Add the Poetry virtual environment to the bashrc
-    echo "export OH_INTERPRETER_PATH=\"$OH_INTERPRETER_PATH\"" >> /etc/bash.bashrc && \
-    echo "export OH_VENV_PATH=\"$OH_VENV_PATH\"" >> /etc/bash.bashrc && \
-    # Activate the Poetry virtual environment
-    echo 'source "$OH_VENV_PATH/bin/activate"' >> /etc/bash.bashrc && \
-    # Use the Poetry virtual environment's Python interpreter
-    echo 'alias python="$OH_INTERPRETER_PATH"' >> /etc/bash.bashrc
-{% endif %}
 # ================================================================
 # END: Copy Project and Install/Update Dependencies
 # ================================================================
--- a/poetry.lock
+++ b/poetry.lock
@@ -3761,13 +3761,13 @@ types-tqdm = "*"

 [[package]]
 name = "litellm"
-version = "1.46.1"
+version = "1.48.9"
 description = "Library to easily interface with LLM API providers"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
 files = [
-    {file = "litellm-1.46.1-py3-none-any.whl", hash = "sha256:f6b78278cf21a38da0d10a8b3e7b1084b6410012552c0a413774d1c43706e5ba"},
-    {file = "litellm-1.46.1.tar.gz", hash = "sha256:993c23d6f5e1d0f070b250d858a6ee87750a032e38f460f8c82385be854bc45f"},
+    {file = "litellm-1.48.9-py3-none-any.whl", hash = "sha256:9608f510e82c27b15bab7bcfab5e1308055f0c457e7881ccfff91c189bf2c055"},
+    {file = "litellm-1.48.9.tar.gz", hash = "sha256:02dd2f66fab24f388692694401bbabd34de5a62a16d064b3f15726a550a65cd3"},
 ]

 [package.dependencies]
@@ -9675,4 +9675,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "5acb0e1ac5538c10add8f72b0f5c2762bea1a08cce7548deccd263934f043cfb"
+content-hash = "96a302abea5291a44d97c2e4c813a8db2e6f3b1327b1c4f7dbf6d00eb8e19560"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ packages = [
 python = "^3.11"
 datasets = "*"
 pandas = "*"
-litellm = "*"
+litellm = "^1.48.6"
 google-generativeai = "*" # To use litellm with Gemini Pro API
 termcolor = "*"
 seaborn = "*"
--- a/tests/runtime/test_bash.py
+++ b/tests/runtime/test_bash.py
@@ -57,6 +57,31 @@ def test_bash_command_pexcept(temp_dir, box_class, run_as_openhands):
        _close_test_runtime(runtime)


+def test_bash_timeout_and_keyboard_interrupt(temp_dir, box_class, run_as_openhands):
+    runtime = _load_runtime(temp_dir, box_class, run_as_openhands)
+    try:
+        action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert (
+            '[Command timed out after 1 seconds. SIGINT was sent to interrupt it.]'
+            in obs.content
+        )
+        assert 'KeyboardInterrupt' in obs.content
+
+        # follow up command should not be affected
+        action = CmdRunAction(command='ls')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    finally:
+        _close_test_runtime(runtime)
+
+
 def test_multiline_commands(temp_dir, box_class):
    runtime = _load_runtime(temp_dir, box_class)
    try:
Author	SHA1	Message	Date
Engel Nyst	7da6e06da6	Retry on litellm's APIError, which includes 502 (#4167 )	2024-10-03 03:00:58 +00:00
Xingyao Wang	c2223a0fe4	upgrade litellm	2024-10-02 19:49:22 +00:00
Xingyao Wang	f2a48a870c	fix wrong import	2024-10-02 19:49:12 +00:00
Xingyao Wang	61d99e9e37	add few seconds to properly receive timeout error from client	2024-10-02 04:08:02 +00:00
Xingyao Wang	9af6399a90	make target_image_tag optional	2024-10-02 01:01:16 +00:00
Xingyao Wang	ac1459b0c9	Update instruction for new version of eval runtime-api (#4128 )	2024-10-01 19:15:37 +00:00
Xingyao Wang	e5c5e1c4e5	bump to new runtime w/o parallel	2024-10-01 17:03:57 +00:00
Xingyao Wang	cc03b59238	fix eval_infer.sh	2024-09-29 21:02:49 +00:00
Xingyao Wang	6999d969bb	[eval] log evaluating warnings directly to console (#4026 )	2024-09-28 05:35:57 +00:00
tobitege	f446237081	revert #3871 dockerfile template: don't write to .bashrc file (#4095 )	2024-09-28 05:22:36 +00:00
Xingyao Wang	891b02d1ce	[runtime] do not keep rebuilding from generic image (#4072 )	2024-09-27 21:15:57 +00:00
Xingyao Wang	78cbd90df0	parser fix for deepseek	2024-09-27 21:10:14 +00:00
Xingyao Wang	4ae0a3c887	change to imap_unordered	2024-09-24 20:32:33 +00:00
Xingyao Wang	6d9385baa2	try fix mp again	2024-09-24 20:32:30 +00:00
Xingyao Wang	7eb44cdeff	use mp Pool instead ProcessPoolExecutor	2024-09-24 14:03:24 +00:00
Xingyao Wang	5a64cf2bca	fix log copy failure	2024-09-20 19:33:29 +00:00
Xingyao Wang	b24a7821ec	[eval] fix evaluation git patch post-processing (#3979 )	2024-09-20 22:55:43 +08:00
Xingyao Wang	caa0f03c7b	Merge commit 'e0f91f2aef053e8ae5c8f78539f086a01346c10e' into eval/24-sep	2024-09-18 16:01:49 +00:00
Xingyao Wang	e0f91f2aef	Update evaluation/swe_bench/eval_infer.py Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-09-18 22:36:57 +08:00
Xingyao Wang	5d1355ffa0	Update evaluation/swe_bench/README.md Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-09-18 22:36:50 +08:00
Xingyao Wang	4c3068c711	Merge branch 'main' into xw/eval-swebench	2024-09-18 08:40:07 -05:00
Xingyao Wang	68b2152942	update output	2024-09-18 13:34:51 +00:00
Xingyao Wang	b7416a4723	print retry time as well	2024-09-18 01:46:43 +00:00
Xingyao Wang	770af8d74b	Revert "bump timeout" This reverts commit `c92cbbb201`.	2024-09-17 22:29:15 +00:00
Xingyao Wang	090f0df452	only increase timeout for /alive	2024-09-17 22:29:01 +00:00
Xingyao Wang	c92cbbb201	bump timeout	2024-09-17 22:25:51 +00:00
Xingyao Wang	ee37af93a1	sleep longer for eval retry	2024-09-17 20:42:11 +00:00
Xingyao Wang	e09e8b4ebf	improve runtime cleanup script	2024-09-17 19:26:41 +00:00
Xingyao Wang	b96d798efa	fix reset logger for n-p=1	2024-09-17 19:18:58 +00:00
Xingyao Wang	9a9d376772	save infer logs as well	2024-09-17 15:46:50 +00:00
Xingyao Wang	9e2a693ed4	save relavant info; remove extra logging	2024-09-17 15:43:30 +00:00
Xingyao Wang	cc3c34c90a	fix eval	2024-09-17 15:40:07 +00:00
Xingyao Wang	279443a563	fix missing log path	2024-09-17 15:06:31 +00:00
Xingyao Wang	8a9d9576a9	use polling to get updates to avoid timeout	2024-09-17 15:03:26 +00:00
Xingyao Wang	79867629db	Merge commit '963f0db6ab7b24a2f45a2692aa948f190d49cac6' into xw/eval-swebench	2024-09-17 14:50:42 +00:00
Xingyao Wang	963f0db6ab	Update evaluation/utils/shared.py Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>	2024-09-17 21:42:28 +08:00
Xingyao Wang	4e93a24e44	Update evaluation/utils/shared.py Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>	2024-09-17 21:42:20 +08:00
Xingyao Wang	20722da8ca	update output filename	2024-09-17 02:08:54 +00:00
Xingyao Wang	b02c98f683	add download_gold_patch	2024-09-17 02:08:32 +00:00
Xingyao Wang	44b5bffd34	fix copy_to	2024-09-17 02:08:18 +00:00
Xingyao Wang	b720eceb59	fix eval_infer command	2024-09-17 02:00:00 +00:00
Xingyao Wang	fb6da23220	set max retries to one for eval_infer	2024-09-17 01:39:32 +00:00
Xingyao Wang	d843fb8bab	Merge commit '33c5cdeb9365ca1d7a9dba92c3476dde951ff5c4' into xw/eval-swebench	2024-09-17 01:39:12 +00:00
Xingyao Wang	33c5cdeb93	remove EvalError and allow passing max_retries	2024-09-17 01:39:04 +00:00
Xingyao Wang	460aa3acbd	only dump keys that exists	2024-09-17 01:37:52 +00:00
Xingyao Wang	4ae8f9cf05	stop print the exact patch	2024-09-17 01:35:32 +00:00
Xingyao Wang	2c7b214a74	print final number	2024-09-17 01:34:55 +00:00
Xingyao Wang	283ef9becc	fix metadata dump	2024-09-17 01:32:09 +00:00
Xingyao Wang	369ceecc63	support evaluate via remote runtime	2024-09-17 01:24:33 +00:00
Xingyao Wang	fe5a67e96d	Merge branch 'main' into xw/eval-fix	2024-09-16 20:15:34 -05:00
Xingyao Wang	cf5da84b6f	increase timeout for instance entry	2024-09-16 22:23:59 +00:00
Xingyao Wang	a314309b57	Merge commit 'a42cc05481b68cb6c1306becb3f7b885667dbf04' into xw/eval-swebench	2024-09-16 21:11:39 +00:00
Xingyao Wang	a42cc05481	only update progress on main loop	2024-09-16 21:01:58 +00:00
Xingyao Wang	e0cdaa2a58	allow set EXP_NAME when run_infer.sh	2024-09-16 20:59:13 +00:00
Xingyao Wang	5fa8fde2f0	[eval] simplify eval error & retry again	2024-09-16 20:58:59 +00:00