change to imap_unordered

try fix mp again
use mp Pool instead of ProcessPoolExecutor
2026-04-29 03:00:45 -04:00 · 2024-09-24 20:33:04 +00:00 · 2024-09-24 20:33:01 +00:00 · 2024-09-24 17:34:58 +00:00 · 2024-09-24 17:34:56 +00:00 · 2024-09-24 17:34:52 +00:00
9 changed files with 150 additions and 126 deletions
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -204,7 +204,7 @@ class CodeActAgent(Agent):
                '</execute_bash>',
                '</execute_browse>',
            ],
-            'temperature': 0.0,
+            'temperature': self.llm.config.temperature,
        }

        if self.llm.is_caching_prompt_active():
--- a/config.template.toml
+++ b/config.template.toml
@@ -159,7 +159,7 @@ model = "gpt-4o"
 #timeout = 0

 # Top p for the API
-#top_p = 0.5
+#top_p = 1.0

 # If model is vision capable, this option allows to disable image processing (useful for cost reduction).
 #disable_vision = true
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -120,7 +120,6 @@ def get_config(
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        max_budget_per_task=4,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=SandboxConfig(
--- a/evaluation/swe_bench/scripts/run_infer.sh
+++ b/evaluation/swe_bench/scripts/run_infer.sh
@@ -11,6 +11,7 @@ MAX_ITER=$5
 NUM_WORKERS=$6
 DATASET=$7
 SPLIT=$8
+N_RUNS=$9

 if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
@@ -73,22 +74,38 @@ echo "EVAL_NOTE: $EVAL_NOTE"

 unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push

-COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
-  --agent-cls $AGENT \
-  --llm-config $MODEL_CONFIG \
-  --max-iterations $MAX_ITER \
-  --max-chars 10000000 \
-  --eval-num-workers $NUM_WORKERS \
-  --eval-note $EVAL_NOTE \
-  --dataset $DATASET \
-  --split $SPLIT"
+run_inference() {
+    local run_eval_note=$1
+    echo "RUN_EVAL_NOTE: $run_eval_note"

-if [ -n "$EVAL_LIMIT" ]; then
-  echo "EVAL_LIMIT: $EVAL_LIMIT"
-  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+    local command="poetry run python evaluation/swe_bench/run_infer.py \
+        --agent-cls $AGENT \
+        --llm-config $MODEL_CONFIG \
+        --max-iterations $MAX_ITER \
+        --max-chars 10000000 \
+        --eval-num-workers $NUM_WORKERS \
+        --eval-note $run_eval_note \
+        --dataset $DATASET \
+        --split $SPLIT"
+
+    if [ -n "$EVAL_LIMIT" ]; then
+        echo "EVAL_LIMIT: $EVAL_LIMIT"
+        command="$command --eval-n-limit $EVAL_LIMIT"
+    fi
+
+    # Run the command
+    eval $command
+}
+
+if [ -n "$N_RUNS" ]; then
+    echo "Running the same experiment $N_RUNS times and save results to different directories"
+    for i in $(seq 1 $N_RUNS); do
+        RUN_EVAL_NOTE="$EVAL_NOTE-run_$i"
+        echo "Running iteration $i of $N_RUNS"
+        run_inference "$RUN_EVAL_NOTE"
+    done
+else
+    run_inference "$EVAL_NOTE"
 fi

-# Run the command
-eval $COMMAND
-
 checkout_original_branch
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -6,7 +6,6 @@ import pathlib
 import subprocess
 import time
 import traceback
-from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Any, Awaitable, Callable, TextIO

 import pandas as pd
@@ -297,6 +296,11 @@ def _process_instance_wrapper(
            time.sleep(5)


+def _process_instance_wrapper_mp(args):
+    """Wrapper for multiprocessing, especially for imap_unordered."""
+    return _process_instance_wrapper(*args)
+
+
 def run_evaluation(
    dataset: pd.DataFrame,
    metadata: EvalMetadata | None,
@@ -323,20 +327,13 @@ def run_evaluation(

    try:
        if use_multiprocessing:
-            with ProcessPoolExecutor(num_workers) as executor:
-                futures = [
-                    executor.submit(
-                        _process_instance_wrapper,
-                        process_instance_func=process_instance_func,
-                        instance=instance,
-                        metadata=metadata,
-                        use_mp=True,
-                        max_retries=max_retries,
-                    )
+            with mp.Pool(num_workers) as pool:
+                args_iter = (
+                    (process_instance_func, instance, metadata, True, max_retries)
                    for _, instance in dataset.iterrows()
-                ]
-                for future in as_completed(futures):
-                    result = future.result()
+                )
+                results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
+                for result in results:
                    update_progress(result, pbar, output_fp)
        else:
            for _, instance in dataset.iterrows():
@@ -373,18 +370,24 @@ def reset_logger_for_multiprocessing(
    # Remove all existing handlers from logger
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
-    # add back the console handler to print ONE line
-    logger.addHandler(get_console_handler())
+
+    # add console handler to print ONE line
+    console_handler = get_console_handler(
+        log_level=logging.INFO, extra_info=f'Instance {instance_id}'
+    )
+    logger.addHandler(console_handler)
    logger.info(
        f'Starting evaluation for instance {instance_id}.\n'
        f'Hint: run "tail -f {log_file}" to see live logs in a separate shell'
    )
-    # Remove all existing handlers from logger
-    for handler in logger.handlers[:]:
-        logger.removeHandler(handler)
+    # Only log WARNING or higher to console
+    console_handler.setLevel(logging.WARNING)
+
+    # Log INFO and above to file
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
+    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
--- a/openhands/core/config.py
+++ b/openhands/core/config.py
@@ -71,8 +71,8 @@ class LLMConfig:
    retry_max_wait: int = 120
    timeout: int | None = None
    max_message_chars: int = 10_000  # maximum number of characters in an observation's content when sent to the llm
-    temperature: float = 0
-    top_p: float = 0.5
+    temperature: float = 0.0
+    top_p: float = 1.0
    custom_llm_provider: str | None = None
    max_input_tokens: int | None = None
    max_output_tokens: int | None = None
--- a/openhands/core/logger.py
+++ b/openhands/core/logger.py
@@ -117,11 +117,14 @@ class SensitiveDataFilter(logging.Filter):
        return True


-def get_console_handler(log_level=logging.INFO):
+def get_console_handler(log_level=logging.INFO, extra_info: str | None = None):
    """Returns a console handler for logging."""
    console_handler = logging.StreamHandler()
    console_handler.setLevel(log_level)
-    console_handler.setFormatter(console_formatter)
+    formatter_str = '%(asctime)s - %(levelname)s - %(message)s'
+    if extra_info:
+        formatter_str = f'{extra_info} - ' + formatter_str
+    console_handler.setFormatter(logging.Formatter(formatter_str))
    return console_handler


--- a/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py
+++ b/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py
@@ -511,94 +511,96 @@ def _edit_file_impl(
        # because the env var will be set AFTER the agentskills is imported
        if enable_auto_lint:
            # BACKUP the original file
-            original_file_backup_path = os.path.join(
-                os.path.dirname(file_name),
-                f'.backup.{os.path.basename(file_name)}',
-            )
-            with open(original_file_backup_path, 'w') as f:
-                f.writelines(lines)
-
-            lint_error, first_error_line = _lint_file(file_name)
-
-            # Select the errors caused by the modification
-            def extract_last_part(line):
-                parts = line.split(':')
-                if len(parts) > 1:
-                    return parts[-1].strip()
-                return line.strip()
-
-            def subtract_strings(str1, str2) -> str:
-                lines1 = str1.splitlines()
-                lines2 = str2.splitlines()
-
-                last_parts1 = [extract_last_part(line) for line in lines1]
-
-                remaining_lines = [
-                    line
-                    for line in lines2
-                    if extract_last_part(line) not in last_parts1
-                ]
-
-                result = '\n'.join(remaining_lines)
-                return result
-
-            if original_lint_error and lint_error:
-                lint_error = subtract_strings(original_lint_error, lint_error)
-                if lint_error == '':
-                    lint_error = None
-                    first_error_line = None
-
-            if lint_error is not None:
-                if first_error_line is not None:
-                    show_line = int(first_error_line)
-                elif is_append:
-                    # original end-of-file
-                    show_line = len(lines)
-                # insert OR edit WILL provide meaningful line numbers
-                elif start is not None and end is not None:
-                    show_line = int((start + end) / 2)
-                else:
-                    raise ValueError('Invalid state. This should never happen.')
-
-                ret_str += LINTER_ERROR_MSG
-                ret_str += lint_error + '\n'
-
-                editor_lines = n_added_lines + 20
-                sep = '-' * 49 + '\n'
-                ret_str += (
-                    f'[This is how your edit would have looked if applied]\n{sep}'
+            with tempfile.TemporaryDirectory() as temp_dir:
+                original_file_backup_path = os.path.join(
+                    temp_dir,
+                    f'.backup.{os.path.basename(file_name)}',
                )
-                ret_str += (
-                    _print_window(file_name, show_line, editor_lines, return_str=True)
-                    + '\n'
-                )
-                ret_str += f'{sep}\n'
+                with open(original_file_backup_path, 'w') as f:
+                    f.writelines(lines)

-                ret_str += '[This is the original code before your edit]\n'
-                ret_str += sep
-                ret_str += (
-                    _print_window(
-                        original_file_backup_path,
-                        show_line,
-                        editor_lines,
-                        return_str=True,
+                lint_error, first_error_line = _lint_file(file_name)
+
+                # Select the errors caused by the modification
+                def extract_last_part(line):
+                    parts = line.split(':')
+                    if len(parts) > 1:
+                        return parts[-1].strip()
+                    return line.strip()
+
+                def subtract_strings(str1, str2) -> str:
+                    lines1 = str1.splitlines()
+                    lines2 = str2.splitlines()
+
+                    last_parts1 = [extract_last_part(line) for line in lines1]
+
+                    remaining_lines = [
+                        line
+                        for line in lines2
+                        if extract_last_part(line) not in last_parts1
+                    ]
+
+                    result = '\n'.join(remaining_lines)
+                    return result
+
+                if original_lint_error and lint_error:
+                    lint_error = subtract_strings(original_lint_error, lint_error)
+                    if lint_error == '':
+                        lint_error = None
+                        first_error_line = None
+
+                if lint_error is not None:
+                    if first_error_line is not None:
+                        show_line = int(first_error_line)
+                    elif is_append:
+                        # original end-of-file
+                        show_line = len(lines)
+                    # insert OR edit WILL provide meaningful line numbers
+                    elif start is not None and end is not None:
+                        show_line = int((start + end) / 2)
+                    else:
+                        raise ValueError('Invalid state. This should never happen.')
+
+                    ret_str += LINTER_ERROR_MSG
+                    ret_str += lint_error + '\n'
+
+                    editor_lines = n_added_lines + 20
+                    sep = '-' * 49 + '\n'
+                    ret_str += (
+                        f'[This is how your edit would have looked if applied]\n{sep}'
                    )
-                    + '\n'
-                )
-                ret_str += sep
-                ret_str += (
-                    'Your changes have NOT been applied. Please fix your edit command and try again.\n'
-                    'You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n'
-                    'DO NOT re-run the same failed edit command. Running it again will lead to the same error.'
-                )
+                    ret_str += (
+                        _print_window(
+                            file_name, show_line, editor_lines, return_str=True
+                        )
+                        + '\n'
+                    )
+                    ret_str += f'{sep}\n'

-                # recover the original file
-                with open(original_file_backup_path) as fin, open(
-                    file_name, 'w'
-                ) as fout:
-                    fout.write(fin.read())
-                os.remove(original_file_backup_path)
-                return ret_str
+                    ret_str += '[This is the original code before your edit]\n'
+                    ret_str += sep
+                    ret_str += (
+                        _print_window(
+                            original_file_backup_path,
+                            show_line,
+                            editor_lines,
+                            return_str=True,
+                        )
+                        + '\n'
+                    )
+                    ret_str += sep
+                    ret_str += (
+                        'Your changes have NOT been applied. Please fix your edit command and try again.\n'
+                        'You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n'
+                        'DO NOT re-run the same failed edit command. Running it again will lead to the same error.'
+                    )
+
+                    # recover the original file
+                    with open(original_file_backup_path) as fin, open(
+                        file_name, 'w'
+                    ) as fout:
+                        fout.write(fin.read())
+                    return ret_str

    except FileNotFoundError as e:
        ret_str += f'File not found: {e}\n'
--- a/openhands/runtime/remote/runtime.py
+++ b/openhands/runtime/remote/runtime.py
@@ -59,7 +59,7 @@ class RemoteRuntime(Runtime):
        self.config = config
        if self.config.sandbox.api_hostname == 'localhost':
            self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime'
-            logger.warning(
+            logger.info(
                'Using localhost as the API hostname is not supported in the RemoteRuntime. Please set a proper hostname.\n'
                'Setting it to default value: api.all-hands.dev/v0/runtime'
            )
Author	SHA1	Message	Date
Xingyao Wang	932af1af7f	change to imap_unordered	2024-09-24 20:33:04 +00:00
Xingyao Wang	b7d5ef2c7a	try fix mp again	2024-09-24 20:33:01 +00:00
Xingyao Wang	6bced445eb	use mp Pool instead of ProcessPoolExecutor	2024-09-24 17:34:58 +00:00
Xingyao Wang	68e9914238	convert warning to default for remoteruntime	2024-09-24 17:34:56 +00:00
Xingyao Wang	76b56af656	improve logging for eval: log warning and above directly to console	2024-09-24 17:34:52 +00:00
Xingyao Wang	62ef5ba54e	Merge commit '6c6ebbdc58b3950d01bdb791703955d3d255a93d' into eval/24-sep-exp	2024-09-21 02:32:05 +00:00
Xingyao Wang	6c6ebbdc58	update temperature in config toml	2024-09-20 22:31:32 -04:00
Xingyao Wang	36984f15be	set temperature back to 0	2024-09-20 22:29:56 -04:00
Xingyao Wang	bdef074e31	make codeact take the temp from config	2024-09-20 22:29:40 -04:00
Xingyao Wang	4fe97b7a2d	update config.toml	2024-09-20 21:52:53 -04:00
Xingyao Wang	4cc4004d44	change top_p & temperature default value to 1.0	2024-09-20 20:42:45 -04:00
Xingyao Wang	b06a5a6a00	Merge commit 'd006a6101e65a6020b728ea2ac9cefe30c10b549' into eval/24-sep-exp	2024-09-20 19:57:37 +00:00
Xingyao Wang	d006a6101e	use temp dir	2024-09-20 15:50:36 -04:00
Xingyao Wang	22bc1a80e1	Merge commit 'b24a7821ec5dbafd826d204c2f1197f8d0eb19e2' into eval/24-sep-exp	2024-09-20 19:18:19 +00:00
Xingyao Wang	b24a7821ec	[eval] fix evaluation git patch post-processing (#3979)	2024-09-20 22:55:43 +08:00
Xingyao Wang	879f9f31e2	fix backup edit	2024-09-19 16:09:57 +00:00
Xingyao Wang	51c6ce398d	support run infer in multiple runs	2024-09-18 19:59:48 +00:00
Xingyao Wang	5b7e4c52c8	remove budget constraint	2024-09-18 16:33:44 +00:00
Xingyao Wang	caa0f03c7b	Merge commit 'e0f91f2aef053e8ae5c8f78539f086a01346c10e' into eval/24-sep	2024-09-18 16:01:49 +00:00
Xingyao Wang	e0f91f2aef	Update evaluation/swe_bench/eval_infer.py Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-09-18 22:36:57 +08:00
Xingyao Wang	5d1355ffa0	Update evaluation/swe_bench/README.md Co-authored-by: Graham Neubig <neubig@gmail.com>	2024-09-18 22:36:50 +08:00
Xingyao Wang	4c3068c711	Merge branch 'main' into xw/eval-swebench	2024-09-18 08:40:07 -05:00
Xingyao Wang	68b2152942	update output	2024-09-18 13:34:51 +00:00
Xingyao Wang	b7416a4723	print retry time as well	2024-09-18 01:46:43 +00:00
Xingyao Wang	770af8d74b	Revert "bump timeout" This reverts commit `c92cbbb201`.	2024-09-17 22:29:15 +00:00
Xingyao Wang	090f0df452	only increase timeout for /alive	2024-09-17 22:29:01 +00:00
Xingyao Wang	c92cbbb201	bump timeout	2024-09-17 22:25:51 +00:00
Xingyao Wang	ee37af93a1	sleep longer for eval retry	2024-09-17 20:42:11 +00:00
Xingyao Wang	e09e8b4ebf	improve runtime cleanup script	2024-09-17 19:26:41 +00:00
Xingyao Wang	b96d798efa	fix reset logger for n-p=1	2024-09-17 19:18:58 +00:00
Xingyao Wang	9a9d376772	save infer logs as well	2024-09-17 15:46:50 +00:00
Xingyao Wang	9e2a693ed4	save relevant info; remove extra logging	2024-09-17 15:43:30 +00:00
Xingyao Wang	cc3c34c90a	fix eval	2024-09-17 15:40:07 +00:00
Xingyao Wang	279443a563	fix missing log path	2024-09-17 15:06:31 +00:00
Xingyao Wang	8a9d9576a9	use polling to get updates to avoid timeout	2024-09-17 15:03:26 +00:00
Xingyao Wang	79867629db	Merge commit '963f0db6ab7b24a2f45a2692aa948f190d49cac6' into xw/eval-swebench	2024-09-17 14:50:42 +00:00
Xingyao Wang	963f0db6ab	Update evaluation/utils/shared.py Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>	2024-09-17 21:42:28 +08:00
Xingyao Wang	4e93a24e44	Update evaluation/utils/shared.py Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>	2024-09-17 21:42:20 +08:00
Xingyao Wang	20722da8ca	update output filename	2024-09-17 02:08:54 +00:00
Xingyao Wang	b02c98f683	add download_gold_patch	2024-09-17 02:08:32 +00:00
Xingyao Wang	44b5bffd34	fix copy_to	2024-09-17 02:08:18 +00:00
Xingyao Wang	b720eceb59	fix eval_infer command	2024-09-17 02:00:00 +00:00
Xingyao Wang	fb6da23220	set max retries to one for eval_infer	2024-09-17 01:39:32 +00:00
Xingyao Wang	d843fb8bab	Merge commit '33c5cdeb9365ca1d7a9dba92c3476dde951ff5c4' into xw/eval-swebench	2024-09-17 01:39:12 +00:00
Xingyao Wang	33c5cdeb93	remove EvalError and allow passing max_retries	2024-09-17 01:39:04 +00:00
Xingyao Wang	460aa3acbd	only dump keys that exists	2024-09-17 01:37:52 +00:00
Xingyao Wang	4ae8f9cf05	stop print the exact patch	2024-09-17 01:35:32 +00:00
Xingyao Wang	2c7b214a74	print final number	2024-09-17 01:34:55 +00:00
Xingyao Wang	283ef9becc	fix metadata dump	2024-09-17 01:32:09 +00:00
Xingyao Wang	369ceecc63	support evaluate via remote runtime	2024-09-17 01:24:33 +00:00
Xingyao Wang	fe5a67e96d	Merge branch 'main' into xw/eval-fix	2024-09-16 20:15:34 -05:00
Xingyao Wang	cf5da84b6f	increase timeout for instance entry	2024-09-16 22:23:59 +00:00
Xingyao Wang	a314309b57	Merge commit 'a42cc05481b68cb6c1306becb3f7b885667dbf04' into xw/eval-swebench	2024-09-16 21:11:39 +00:00
Xingyao Wang	a42cc05481	only update progress on main loop	2024-09-16 21:01:58 +00:00
Xingyao Wang	e0cdaa2a58	allow set EXP_NAME when run_infer.sh	2024-09-16 20:59:13 +00:00
Xingyao Wang	5fa8fde2f0	[eval] simplify eval error & retry again	2024-09-16 20:58:59 +00:00