Compare commits

...

56 Commits

Author SHA1 Message Date
Xingyao Wang
932af1af7f change to imap_unordered 2024-09-24 20:33:04 +00:00
Xingyao Wang
b7d5ef2c7a try fix mp again 2024-09-24 20:33:01 +00:00
Xingyao Wang
6bced445eb use mp Pool instead ProcessPoolExecutor 2024-09-24 17:34:58 +00:00
Xingyao Wang
68e9914238 convert warning to default for remoteruntime 2024-09-24 17:34:56 +00:00
Xingyao Wang
76b56af656 improve logging for eval: log warning and above directly to console 2024-09-24 17:34:52 +00:00
Xingyao Wang
62ef5ba54e Merge commit '6c6ebbdc58b3950d01bdb791703955d3d255a93d' into eval/24-sep-exp 2024-09-21 02:32:05 +00:00
Xingyao Wang
6c6ebbdc58 update temperature in config toml 2024-09-20 22:31:32 -04:00
Xingyao Wang
36984f15be set temperature back to 0 2024-09-20 22:29:56 -04:00
Xingyao Wang
bdef074e31 make codeact take the temp from config 2024-09-20 22:29:40 -04:00
Xingyao Wang
4fe97b7a2d update config.toml 2024-09-20 21:52:53 -04:00
Xingyao Wang
4cc4004d44 change top_p & temperature default value to 1.0 2024-09-20 20:42:45 -04:00
Xingyao Wang
b06a5a6a00 Merge commit 'd006a6101e65a6020b728ea2ac9cefe30c10b549' into eval/24-sep-exp 2024-09-20 19:57:37 +00:00
Xingyao Wang
d006a6101e use temp dir 2024-09-20 15:50:36 -04:00
Xingyao Wang
22bc1a80e1 Merge commit 'b24a7821ec5dbafd826d204c2f1197f8d0eb19e2' into eval/24-sep-exp 2024-09-20 19:18:19 +00:00
Xingyao Wang
b24a7821ec [eval] fix evaluation git patch post-processing (#3979) 2024-09-20 22:55:43 +08:00
Xingyao Wang
879f9f31e2 fix backup edit 2024-09-19 16:09:57 +00:00
Xingyao Wang
51c6ce398d support run infer in multiple runs 2024-09-18 19:59:48 +00:00
Xingyao Wang
5b7e4c52c8 remove budget constraint 2024-09-18 16:33:44 +00:00
Xingyao Wang
caa0f03c7b Merge commit 'e0f91f2aef053e8ae5c8f78539f086a01346c10e' into eval/24-sep 2024-09-18 16:01:49 +00:00
Xingyao Wang
e0f91f2aef Update evaluation/swe_bench/eval_infer.py
Co-authored-by: Graham Neubig <neubig@gmail.com>
2024-09-18 22:36:57 +08:00
Xingyao Wang
5d1355ffa0 Update evaluation/swe_bench/README.md
Co-authored-by: Graham Neubig <neubig@gmail.com>
2024-09-18 22:36:50 +08:00
Xingyao Wang
4c3068c711 Merge branch 'main' into xw/eval-swebench 2024-09-18 08:40:07 -05:00
Xingyao Wang
68b2152942 update output 2024-09-18 13:34:51 +00:00
Xingyao Wang
b7416a4723 print retry time as well 2024-09-18 01:46:43 +00:00
Xingyao Wang
770af8d74b Revert "bump timeout"
This reverts commit c92cbbb201.
2024-09-17 22:29:15 +00:00
Xingyao Wang
090f0df452 only increase timeout for /alive 2024-09-17 22:29:01 +00:00
Xingyao Wang
c92cbbb201 bump timeout 2024-09-17 22:25:51 +00:00
Xingyao Wang
ee37af93a1 sleep longer for eval retry 2024-09-17 20:42:11 +00:00
Xingyao Wang
e09e8b4ebf improve runtime cleanup script 2024-09-17 19:26:41 +00:00
Xingyao Wang
b96d798efa fix reset logger for n-p=1 2024-09-17 19:18:58 +00:00
Xingyao Wang
9a9d376772 save infer logs as well 2024-09-17 15:46:50 +00:00
Xingyao Wang
9e2a693ed4 save relavant info; remove extra logging 2024-09-17 15:43:30 +00:00
Xingyao Wang
cc3c34c90a fix eval 2024-09-17 15:40:07 +00:00
Xingyao Wang
279443a563 fix missing log path 2024-09-17 15:06:31 +00:00
Xingyao Wang
8a9d9576a9 use polling to get updates to avoid timeout 2024-09-17 15:03:26 +00:00
Xingyao Wang
79867629db Merge commit '963f0db6ab7b24a2f45a2692aa948f190d49cac6' into xw/eval-swebench 2024-09-17 14:50:42 +00:00
Xingyao Wang
963f0db6ab Update evaluation/utils/shared.py
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
2024-09-17 21:42:28 +08:00
Xingyao Wang
4e93a24e44 Update evaluation/utils/shared.py
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
2024-09-17 21:42:20 +08:00
Xingyao Wang
20722da8ca update output filename 2024-09-17 02:08:54 +00:00
Xingyao Wang
b02c98f683 add download_gold_patch 2024-09-17 02:08:32 +00:00
Xingyao Wang
44b5bffd34 fix copy_to 2024-09-17 02:08:18 +00:00
Xingyao Wang
b720eceb59 fix eval_infer command 2024-09-17 02:00:00 +00:00
Xingyao Wang
fb6da23220 set max retries to one for eval_infer 2024-09-17 01:39:32 +00:00
Xingyao Wang
d843fb8bab Merge commit '33c5cdeb9365ca1d7a9dba92c3476dde951ff5c4' into xw/eval-swebench 2024-09-17 01:39:12 +00:00
Xingyao Wang
33c5cdeb93 remove EvalError and allow passing max_retries 2024-09-17 01:39:04 +00:00
Xingyao Wang
460aa3acbd only dump keys that exists 2024-09-17 01:37:52 +00:00
Xingyao Wang
4ae8f9cf05 stop print the exact patch 2024-09-17 01:35:32 +00:00
Xingyao Wang
2c7b214a74 print final number 2024-09-17 01:34:55 +00:00
Xingyao Wang
283ef9becc fix metadata dump 2024-09-17 01:32:09 +00:00
Xingyao Wang
369ceecc63 support evaluate via remote runtime 2024-09-17 01:24:33 +00:00
Xingyao Wang
fe5a67e96d Merge branch 'main' into xw/eval-fix 2024-09-16 20:15:34 -05:00
Xingyao Wang
cf5da84b6f increase timeout for instance entry 2024-09-16 22:23:59 +00:00
Xingyao Wang
a314309b57 Merge commit 'a42cc05481b68cb6c1306becb3f7b885667dbf04' into xw/eval-swebench 2024-09-16 21:11:39 +00:00
Xingyao Wang
a42cc05481 only update progress on main loop 2024-09-16 21:01:58 +00:00
Xingyao Wang
e0cdaa2a58 allow set EXP_NAME when run_infer.sh 2024-09-16 20:59:13 +00:00
Xingyao Wang
5fa8fde2f0 [eval] simplify eval error & retry again 2024-09-16 20:58:59 +00:00
9 changed files with 150 additions and 126 deletions

View File

@@ -204,7 +204,7 @@ class CodeActAgent(Agent):
'</execute_bash>',
'</execute_browse>',
],
'temperature': 0.0,
'temperature': self.llm.config.temperature,
}
if self.llm.is_caching_prompt_active():

View File

@@ -159,7 +159,7 @@ model = "gpt-4o"
#timeout = 0
# Top p for the API
#top_p = 0.5
#top_p = 1.0
# If model is vision capable, this option allows to disable image processing (useful for cost reduction).
#disable_vision = true

View File

@@ -120,7 +120,6 @@ def get_config(
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_budget_per_task=4,
max_iterations=metadata.max_iterations,
runtime=os.environ.get('RUNTIME', 'eventstream'),
sandbox=SandboxConfig(

View File

@@ -11,6 +11,7 @@ MAX_ITER=$5
NUM_WORKERS=$6
DATASET=$7
SPLIT=$8
N_RUNS=$9
if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
@@ -73,22 +74,38 @@ echo "EVAL_NOTE: $EVAL_NOTE"
unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations $MAX_ITER \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $EVAL_NOTE \
--dataset $DATASET \
--split $SPLIT"
run_inference() {
local run_eval_note=$1
echo "RUN_EVAL_NOTE: $run_eval_note"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
local command="poetry run python evaluation/swe_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations $MAX_ITER \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $run_eval_note \
--dataset $DATASET \
--split $SPLIT"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
command="$command --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $command
}
if [ -n "$N_RUNS" ]; then
echo "Running the same experiment $N_RUNS times and save results to different directories"
for i in $(seq 1 $N_RUNS); do
RUN_EVAL_NOTE="$EVAL_NOTE-run_$i"
echo "Running iteration $i of $N_RUNS"
run_inference "$RUN_EVAL_NOTE"
done
else
run_inference "$EVAL_NOTE"
fi
# Run the command
eval $COMMAND
checkout_original_branch

View File

@@ -6,7 +6,6 @@ import pathlib
import subprocess
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Any, Awaitable, Callable, TextIO
import pandas as pd
@@ -297,6 +296,11 @@ def _process_instance_wrapper(
time.sleep(5)
def _process_instance_wrapper_mp(args):
    """Adapt `_process_instance_wrapper` to single-argument pool callables.

    Pool mapping helpers such as `imap_unordered` pass each work item to the
    worker as one positional argument, so the caller packs the wrapper's
    arguments into a single iterable and this shim spreads them back out.
    """
    unpacked_args = tuple(args)
    return _process_instance_wrapper(*unpacked_args)
def run_evaluation(
dataset: pd.DataFrame,
metadata: EvalMetadata | None,
@@ -323,20 +327,13 @@ def run_evaluation(
try:
if use_multiprocessing:
with ProcessPoolExecutor(num_workers) as executor:
futures = [
executor.submit(
_process_instance_wrapper,
process_instance_func=process_instance_func,
instance=instance,
metadata=metadata,
use_mp=True,
max_retries=max_retries,
)
with mp.Pool(num_workers) as pool:
args_iter = (
(process_instance_func, instance, metadata, True, max_retries)
for _, instance in dataset.iterrows()
]
for future in as_completed(futures):
result = future.result()
)
results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
for result in results:
update_progress(result, pbar, output_fp)
else:
for _, instance in dataset.iterrows():
@@ -373,18 +370,24 @@ def reset_logger_for_multiprocessing(
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
# add console handler to print ONE line
console_handler = get_console_handler(
log_level=logging.INFO, extra_info=f'Instance {instance_id}'
)
logger.addHandler(console_handler)
logger.info(
f'Starting evaluation for instance {instance_id}.\n'
f'Hint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# Only log WARNING or higher to console
console_handler.setLevel(logging.WARNING)
# Log INFO and above to file
os.makedirs(os.path.dirname(log_file), exist_ok=True)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
file_handler.setLevel(logging.INFO)
logger.addHandler(file_handler)

View File

@@ -71,8 +71,8 @@ class LLMConfig:
retry_max_wait: int = 120
timeout: int | None = None
max_message_chars: int = 10_000 # maximum number of characters in an observation's content when sent to the llm
temperature: float = 0
top_p: float = 0.5
temperature: float = 0.0
top_p: float = 1.0
custom_llm_provider: str | None = None
max_input_tokens: int | None = None
max_output_tokens: int | None = None

View File

@@ -117,11 +117,14 @@ class SensitiveDataFilter(logging.Filter):
return True
def get_console_handler(log_level=logging.INFO):
def get_console_handler(log_level=logging.INFO, extra_info: str | None = None):
"""Returns a console handler for logging."""
console_handler = logging.StreamHandler()
console_handler.setLevel(log_level)
console_handler.setFormatter(console_formatter)
formatter_str = '%(asctime)s - %(levelname)s - %(message)s'
if extra_info:
formatter_str = f'{extra_info} - ' + formatter_str
console_handler.setFormatter(logging.Formatter(formatter_str))
return console_handler

View File

@@ -511,94 +511,96 @@ def _edit_file_impl(
# because the env var will be set AFTER the agentskills is imported
if enable_auto_lint:
# BACKUP the original file
original_file_backup_path = os.path.join(
os.path.dirname(file_name),
f'.backup.{os.path.basename(file_name)}',
)
with open(original_file_backup_path, 'w') as f:
f.writelines(lines)
lint_error, first_error_line = _lint_file(file_name)
# Select the errors caused by the modification
def extract_last_part(line):
parts = line.split(':')
if len(parts) > 1:
return parts[-1].strip()
return line.strip()
def subtract_strings(str1, str2) -> str:
lines1 = str1.splitlines()
lines2 = str2.splitlines()
last_parts1 = [extract_last_part(line) for line in lines1]
remaining_lines = [
line
for line in lines2
if extract_last_part(line) not in last_parts1
]
result = '\n'.join(remaining_lines)
return result
if original_lint_error and lint_error:
lint_error = subtract_strings(original_lint_error, lint_error)
if lint_error == '':
lint_error = None
first_error_line = None
if lint_error is not None:
if first_error_line is not None:
show_line = int(first_error_line)
elif is_append:
# original end-of-file
show_line = len(lines)
# insert OR edit WILL provide meaningful line numbers
elif start is not None and end is not None:
show_line = int((start + end) / 2)
else:
raise ValueError('Invalid state. This should never happen.')
ret_str += LINTER_ERROR_MSG
ret_str += lint_error + '\n'
editor_lines = n_added_lines + 20
sep = '-' * 49 + '\n'
ret_str += (
f'[This is how your edit would have looked if applied]\n{sep}'
with tempfile.TemporaryDirectory() as temp_dir:
original_file_backup_path = os.path.join(
temp_dir,
f'.backup.{os.path.basename(file_name)}',
)
ret_str += (
_print_window(file_name, show_line, editor_lines, return_str=True)
+ '\n'
)
ret_str += f'{sep}\n'
with open(original_file_backup_path, 'w') as f:
f.writelines(lines)
ret_str += '[This is the original code before your edit]\n'
ret_str += sep
ret_str += (
_print_window(
original_file_backup_path,
show_line,
editor_lines,
return_str=True,
lint_error, first_error_line = _lint_file(file_name)
# Select the errors caused by the modification
def extract_last_part(line):
parts = line.split(':')
if len(parts) > 1:
return parts[-1].strip()
return line.strip()
def subtract_strings(str1, str2) -> str:
lines1 = str1.splitlines()
lines2 = str2.splitlines()
last_parts1 = [extract_last_part(line) for line in lines1]
remaining_lines = [
line
for line in lines2
if extract_last_part(line) not in last_parts1
]
result = '\n'.join(remaining_lines)
return result
if original_lint_error and lint_error:
lint_error = subtract_strings(original_lint_error, lint_error)
if lint_error == '':
lint_error = None
first_error_line = None
if lint_error is not None:
if first_error_line is not None:
show_line = int(first_error_line)
elif is_append:
# original end-of-file
show_line = len(lines)
# insert OR edit WILL provide meaningful line numbers
elif start is not None and end is not None:
show_line = int((start + end) / 2)
else:
raise ValueError('Invalid state. This should never happen.')
ret_str += LINTER_ERROR_MSG
ret_str += lint_error + '\n'
editor_lines = n_added_lines + 20
sep = '-' * 49 + '\n'
ret_str += (
f'[This is how your edit would have looked if applied]\n{sep}'
)
+ '\n'
)
ret_str += sep
ret_str += (
'Your changes have NOT been applied. Please fix your edit command and try again.\n'
'You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n'
'DO NOT re-run the same failed edit command. Running it again will lead to the same error.'
)
ret_str += (
_print_window(
file_name, show_line, editor_lines, return_str=True
)
+ '\n'
)
ret_str += f'{sep}\n'
# recover the original file
with open(original_file_backup_path) as fin, open(
file_name, 'w'
) as fout:
fout.write(fin.read())
os.remove(original_file_backup_path)
return ret_str
ret_str += '[This is the original code before your edit]\n'
ret_str += sep
ret_str += (
_print_window(
original_file_backup_path,
show_line,
editor_lines,
return_str=True,
)
+ '\n'
)
ret_str += sep
ret_str += (
'Your changes have NOT been applied. Please fix your edit command and try again.\n'
'You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code.\n'
'DO NOT re-run the same failed edit command. Running it again will lead to the same error.'
)
# recover the original file
with open(original_file_backup_path) as fin, open(
file_name, 'w'
) as fout:
fout.write(fin.read())
return ret_str
except FileNotFoundError as e:
ret_str += f'File not found: {e}\n'

View File

@@ -59,7 +59,7 @@ class RemoteRuntime(Runtime):
self.config = config
if self.config.sandbox.api_hostname == 'localhost':
self.config.sandbox.api_hostname = 'api.all-hands.dev/v0/runtime'
logger.warning(
logger.info(
'Using localhost as the API hostname is not supported in the RemoteRuntime. Please set a proper hostname.\n'
'Setting it to default value: api.all-hands.dev/v0/runtime'
)