mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
56 Commits
neubig/cli
...
eval/24-se
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
932af1af7f | ||
|
|
b7d5ef2c7a | ||
|
|
6bced445eb | ||
|
|
68e9914238 | ||
|
|
76b56af656 | ||
|
|
62ef5ba54e | ||
|
|
6c6ebbdc58 | ||
|
|
36984f15be | ||
|
|
bdef074e31 | ||
|
|
4fe97b7a2d | ||
|
|
4cc4004d44 | ||
|
|
b06a5a6a00 | ||
|
|
d006a6101e | ||
|
|
22bc1a80e1 | ||
|
|
b24a7821ec | ||
|
|
879f9f31e2 | ||
|
|
51c6ce398d | ||
|
|
5b7e4c52c8 | ||
|
|
caa0f03c7b | ||
|
|
e0f91f2aef | ||
|
|
5d1355ffa0 | ||
|
|
4c3068c711 | ||
|
|
68b2152942 | ||
|
|
b7416a4723 | ||
|
|
770af8d74b | ||
|
|
090f0df452 | ||
|
|
c92cbbb201 | ||
|
|
ee37af93a1 | ||
|
|
e09e8b4ebf | ||
|
|
b96d798efa | ||
|
|
9a9d376772 | ||
|
|
9e2a693ed4 | ||
|
|
cc3c34c90a | ||
|
|
279443a563 | ||
|
|
8a9d9576a9 | ||
|
|
79867629db | ||
|
|
963f0db6ab | ||
|
|
4e93a24e44 | ||
|
|
20722da8ca | ||
|
|
b02c98f683 | ||
|
|
44b5bffd34 | ||
|
|
b720eceb59 | ||
|
|
fb6da23220 | ||
|
|
d843fb8bab | ||
|
|
33c5cdeb93 | ||
|
|
460aa3acbd | ||
|
|
4ae8f9cf05 | ||
|
|
2c7b214a74 | ||
|
|
283ef9becc | ||
|
|
369ceecc63 | ||
|
|
fe5a67e96d | ||
|
|
cf5da84b6f | ||
|
|
a314309b57 | ||
|
|
a42cc05481 | ||
|
|
e0cdaa2a58 | ||
|
|
5fa8fde2f0 |
@@ -204,7 +204,7 @@ class CodeActAgent(Agent):
|
||||
'</execute_bash>',
|
||||
'</execute_browse>',
|
||||
],
|
||||
'temperature': 0.0,
|
||||
'temperature': self.llm.config.temperature,
|
||||
}
|
||||
|
||||
if self.llm.is_caching_prompt_active():
|
||||
|
||||
@@ -159,7 +159,7 @@ model = "gpt-4o"
|
||||
#timeout = 0
|
||||
|
||||
# Top p for the API
|
||||
#top_p = 0.5
|
||||
#top_p = 1.0
|
||||
|
||||
# If model is vision capable, this option allows to disable image processing (useful for cost reduction).
|
||||
#disable_vision = true
|
||||
|
||||
@@ -120,7 +120,6 @@ def get_config(
|
||||
config = AppConfig(
|
||||
default_agent=metadata.agent_class,
|
||||
run_as_openhands=False,
|
||||
max_budget_per_task=4,
|
||||
max_iterations=metadata.max_iterations,
|
||||
runtime=os.environ.get('RUNTIME', 'eventstream'),
|
||||
sandbox=SandboxConfig(
|
||||
|
||||
@@ -11,6 +11,7 @@ MAX_ITER=$5
|
||||
NUM_WORKERS=$6
|
||||
DATASET=$7
|
||||
SPLIT=$8
|
||||
N_RUNS=$9
|
||||
|
||||
if [ -z "$NUM_WORKERS" ]; then
|
||||
NUM_WORKERS=1
|
||||
@@ -73,22 +74,38 @@ echo "EVAL_NOTE: $EVAL_NOTE"
|
||||
|
||||
unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
|
||||
|
||||
COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $EVAL_NOTE \
|
||||
--dataset $DATASET \
|
||||
--split $SPLIT"
|
||||
run_inference() {
|
||||
local run_eval_note=$1
|
||||
echo "RUN_EVAL_NOTE: $run_eval_note"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
|
||||
local command="poetry run python evaluation/swe_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $run_eval_note \
|
||||
--dataset $DATASET \
|
||||
--split $SPLIT"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
command="$command --eval-n-limit $EVAL_LIMIT"
|
||||
fi
|
||||
|
||||
# Run the command
|
||||
eval $command
|
||||
}
|
||||
|
||||
if [ -n "$N_RUNS" ]; then
|
||||
echo "Running the same experiment $N_RUNS times and save results to different directories"
|
||||
for i in $(seq 1 $N_RUNS); do
|
||||
RUN_EVAL_NOTE="$EVAL_NOTE-run_$i"
|
||||
echo "Running iteration $i of $N_RUNS"
|
||||
run_inference "$RUN_EVAL_NOTE"
|
||||
done
|
||||
else
|
||||
run_inference "$EVAL_NOTE"
|
||||
fi
|
||||
|
||||
# Run the command
|
||||
eval $COMMAND
|
||||
|
||||
checkout_original_branch
|
||||
|
||||
@@ -6,7 +6,6 @@ import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
import traceback
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from typing import Any, Awaitable, Callable, TextIO
|
||||
|
||||
import pandas as pd
|
||||
@@ -297,6 +296,11 @@ def _process_instance_wrapper(
|
||||
time.sleep(5)
|
||||
|
||||
|
||||
def _process_instance_wrapper_mp(args):
|
||||
"""Wrapper for multiprocessing, especially for imap_unordered."""
|
||||
return _process_instance_wrapper(*args)
|
||||
|
||||
|
||||
def run_evaluation(
|
||||
dataset: pd.DataFrame,
|
||||
metadata: EvalMetadata | None,
|
||||
@@ -323,20 +327,13 @@ def run_evaluation(
|
||||
|
||||
try:
|
||||
if use_multiprocessing:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = [
|
||||
executor.submit(
|
||||
_process_instance_wrapper,
|
||||
process_instance_func=process_instance_func,
|
||||
instance=instance,
|
||||
metadata=metadata,
|
||||
use_mp=True,
|
||||
max_retries=max_retries,
|
||||
)
|
||||
with mp.Pool(num_workers) as pool:
|
||||
args_iter = (
|
||||
(process_instance_func, instance, metadata, True, max_retries)
|
||||
for _, instance in dataset.iterrows()
|
||||
]
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
)
|
||||
results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
|
||||
for result in results:
|
||||
update_progress(result, pbar, output_fp)
|
||||
else:
|
||||
for _, instance in dataset.iterrows():
|
||||
@@ -373,18 +370,24 @@ def reset_logger_for_multiprocessing(
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
# add back the console handler to print ONE line
|
||||
logger.addHandler(get_console_handler())
|
||||
|
||||
# add console handler to print ONE line
|
||||
console_handler = get_console_handler(
|
||||
log_level=logging.INFO, extra_info=f'Instance {instance_id}'
|
||||
)
|
||||
logger.addHandler(console_handler)
|
||||
logger.info(
|
||||
f'Starting evaluation for instance {instance_id}.\n'
|
||||
f'Hint: run "tail -f {log_file}" to see live logs in a separate shell'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
# Only log WARNING or higher to console
|
||||
console_handler.setLevel(logging.WARNING)
|
||||
|
||||
# Log INFO and above to file
|
||||
os.makedirs(os.path.dirname(log_file), exist_ok=True)
|
||||
file_handler = logging.FileHandler(log_file)
|
||||
file_handler.setFormatter(
|
||||
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||
)
|
||||
file_handler.setLevel(logging.INFO)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
@@ -71,8 +71,8 @@ class LLMConfig:
|
||||
retry_max_wait: int = 120
|
||||
timeout: int | None = None
|
||||
max_message_chars: int = 10_000 # maximum number of characters in an observation's content when sent to the llm
|
||||
temperature: float = 0
|
||||
top_p: float = 0.5
|
||||
temperature: float = 0.0
|
||||
top_p: float = 1.0
|
||||
custom_llm_provider: str | None = None
|
||||
max_input_tokens: int | None = None
|
||||
max_output_tokens: int | None = None
|
||||
|
||||
@@ -117,11 +117,14 @@ class SensitiveDataFilter(logging.Filter):
|
||||
return True
|
||||
|
||||
|
||||
def get_console_handler(log_level=logging.INFO):
|
||||
def get_console_handler(log_level=logging.INFO, extra_info: str | None = None):
|
||||
"""Returns a console handler for logging."""
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setLevel(log_level)
|
||||
console_handler.setFormatter(console_formatter)
|
||||
formatter_str = '%(asctime)s - %(levelname)s - %(message)s'
|
||||
if extra_info:
|
||||
formatter_str = f'{extra_info} - ' + formatter_str
|
||||
console_handler.setFormatter(logging.Formatter(formatter_str))
|
||||
return console_handler
|
||||
|
||||
|
||||
|
||||
@@ -511,94 +511,96 @@ def _edit_file_impl(
|
||||
# because the env var will be set AFTER the agentskills is imported
|
||||
if enable_auto_lint:
|
||||
# BACKUP the original file
|
||||
original_file_backup_path = os.path.join(
|
||||
os.path.dirname(file_name),
|
||||
f'.backup.{os.path.basename(file_name)}',
|
||||
)
|
||||
with open(original_file_backup_path, 'w') as f:
|
||||
f.writelines(lines)
|
||||
|
||||
lint_error, first_error_line = _lint_file(file_name)
|
||||
|
||||
# Select the errors caused by the modification
|
||||
def extract_last_part(line):
|
||||
parts = line.split(':')
|
||||
if len(parts) > 1:
|
||||
return parts[-1].strip()
|
||||
return line.strip()
|
||||
|
||||
def subtract_strings(str1, str2) -> str:
|
||||
lines1 = str1.splitlines()
|
||||
lines2 = str2.splitlines()
|
||||
|
||||
last_parts1 = [extract_last_part(line) for line in lines1]
|
||||
|
||||
remaining_lines = [
|
||||
line
|
||||
for line in lines2
|
||||
if extract_last_part(line) not in last_parts1
|
||||
]
|
||||
|
||||
result = '\n'.join(remaining_lines)
|
||||
return result
|
||||
|
||||
if original_lint_error and lint_error:
|
||||
lint_error = subtract_strings(original_lint_error, lint_error)
|
||||
if lint_error == '':
|
||||
lint_error = None
|
||||
first_error_line = None
|
||||
|
||||
if lint_error is not None:
|
||||
if first_error_line is not None:
|
||||
show_line = int(first_error_line)
|
||||
elif is_append:
|
||||
# original end-of-file
|
||||
show_line = len(lines)
|
||||
# insert OR edit WILL provide meaningful line numbers
|
||||
elif start is not None and end is not None:
|
||||
show_line = int((start + end) / 2)
|
||||
else:
|
||||
raise ValueError('Invalid state. This should never happen.')
|
||||
|
||||
ret_str += LINTER_ERROR_MSG
|
||||
ret_str += lint_error + '\n'
|
||||
|
||||
editor_lines = n_added_lines + 20
|
||||
sep = '-' * 49 + '\n'
|
||||
ret_str += (
|
||||
f'[This is how your edit would have looked if applied]\n{sep}'
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
original_file_backup_path = os.path.join(
|
||||
temp_dir,
|
||||
f'.backup.{os.path.basename(file_name)}',
|
||||
)
|
||||
ret_str += (
|
||||
_print_window(file_name, show_line, editor_lines, return_str=True)
|
||||
+ '\n'
|
||||
)
|
||||
ret_str += f'{sep}\n'
|
||||
with open(original_file_backup_path, 'w') as f:
|
||||
f.writelines(lines)
|
||||
|
||||
ret_str += '[This is the original code before your edit]\n'
|
||||
ret_str += sep
|
||||
ret_str += (
|
||||
_print_window(
|
||||
original_file_backup_path,
|
||||
show_line,
|
||||
editor_lines,
|
||||
return_str=True,
|
||||
lint_error, first_error_line = _lint_file(file_name)
|
||||
|
||||
# Select the errors caused by the modification
|
||||
def extract_last_part(line):
|
||||
parts = line.split(':')
|
||||
if len(parts) > 1:
|
||||
return parts[-1].strip()
|
||||
return line.strip()
|
||||
|
||||
def subtract_strings(str1, str2) -> str:
|
||||
lines1 = str1.splitlines()
|
||||
lines2 = str2.splitlines()
|
||||
|
||||
last_parts1 = [extract_last_part(line) for line in lines1]
|
||||
|
||||
remaining_lines = [
|
||||
line
|
||||
for line in lines2
|
||||
if extract_last_part(line) not in last_parts1
|
||||
]
|
||||
|
||||
result = '\n'.join(remaining_lines)
|
||||
return result
|
||||
|
||||
if original_lint_error and lint_error:
|
||||
lint_error = subtract_strings(original_lint_error, lint_error)
|
||||
if lint_error == '':
|
||||
lint_error = None
|
||||
first_error_line = None
|
||||
|
||||
if lint_error is not None:
|
||||
if first_error_line is not None:
|
||||
show_line = int(first_error_line)
|
||||
elif is_append:
|
||||
# original end-of-file
|
||||
show_line = len(lines)
|
||||
# insert OR edit WILL provide meaningful line numbers
|
||||
elif start is not None and end is not None:
|
||||
show_line = int((start + end) / 2)
|
||||
else:
|
||||
raise ValueError('Invalid state. This should never happen.')
|
||||
|
||||
ret_str += LINTER_ERROR_MSG
|
||||
ret_str += lint_error + '\n'
|
||||
|
||||
editor_lines = n_added_lines + 20
|
||||
sep = '-' * 49 + '\n'
|
||||
ret_str += (
|
||||
f'[This is how your edit would have looked if applied]\n{sep}'
|
||||
)
|
||||
+ '\n'
|
||||
)
|
||||
ret_str += sep
|
||||
ret_str += (
|
||||
'Your changes have NOT been applied. Please fix your edit command and try again.\n'
|
||||
'You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n'
|
||||
'DO NOT re-run the same failed edit command. Running it again will lead to the same error.'
|
||||
)
|
||||
ret_str += (
|
||||
_print_window(
|
||||
file_name, show_line, editor_lines, return_str=True
|
||||
)
|
||||
+ '\n'
|
||||
)
|
||||
ret_str += f'{sep}\n'
|
||||
|
||||
# recover the original file
|
||||
with open(original_file_backup_path) as fin, open(
|
||||
file_name, 'w'
|
||||
) as fout:
|
||||
fout.write(fin.read())
|
||||
os.remove(original_file_backup_path)
|
||||
return ret_str
|
||||
ret_str += '[This is the original code before your edit]\n'
|
||||
ret_str += sep
|
||||
ret_str += (
|
||||
_print_window(
|
||||
original_file_backup_path,
|
||||
show_line,
|
||||
editor_lines,
|
||||
return_str=True,
|
||||
)
|
||||
+ '\n'
|
||||
)
|
||||
ret_str += sep
|
||||
ret_str += (
|
||||
'Your changes have NOT been applied. Please fix your edit command and try again.\n'
|
||||
'You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n'
|
||||
'DO NOT re-run the same failed edit command. Running it again will lead to the same error.'
|
||||
)
|
||||
|
||||
# recover the original file
|
||||
with open(original_file_backup_path) as fin, open(
|
||||
file_name, 'w'
|
||||
) as fout:
|
||||
fout.write(fin.read())
|
||||
return ret_str
|
||||
|
||||
except FileNotFoundError as e:
|
||||
ret_str += f'File not found: {e}\n'
|
||||
|
||||
@@ -59,7 +59,7 @@ class RemoteRuntime(Runtime):
|
||||
self.config = config
|
||||
if self.config.sandbox.api_hostname == 'localhost':
|
||||
self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime'
|
||||
logger.warning(
|
||||
logger.info(
|
||||
'Using localhost as the API hostname is not supported in the RemoteRuntime. Please set a proper hostname.\n'
|
||||
'Setting it to default value: api.all-hands.dev/v0/runtime'
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user