Update pre-commit hook versions to most recent versions (#8343)
Co-authored-by: openhands <openhands@all-hands.dev>
3  .github/scripts/check_version_consistency.py  (vendored)
@@ -2,10 +2,9 @@
 import os
 import re
 import sys
-from typing import Set, Tuple


-def find_version_references(directory: str) -> Tuple[Set[str], Set[str]]:
+def find_version_references(directory: str) -> tuple[set[str], set[str]]:
     openhands_versions = set()
     runtime_versions = set()
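Note: the return-annotation change above follows PEP 585, which lets the built-in tuple/set types be used as generics on Python 3.9+ so the typing.Set/typing.Tuple import can be dropped; the bumped Ruff version likely enforces this through its pyupgrade-derived rules. A minimal illustrative sketch (the group helper below is hypothetical, not from the repository):

# Before (needs: from typing import Dict, List, Set, Tuple):
#     def group(items: List[str]) -> Dict[str, Tuple[int, Set[str]]]: ...
# After (PEP 585, no typing import needed):
def group(items: list[str]) -> dict[str, tuple[int, set[str]]]:
    # Group items by their first character, tracking a count and the unique values.
    out: dict[str, tuple[int, set[str]]] = {}
    for item in items:
        count, seen = out.get(item[:1], (0, set()))
        seen.add(item)
        out[item[:1]] = (count + 1, seen)
    return out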
0  .openhands/pre-commit.sh  (Normal file → Executable file)
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
         exclude: docs/modules/python
@@ -10,17 +10,17 @@ repos:
       - id: debug-statements

   - repo: https://github.com/tox-dev/pyproject-fmt
-    rev: 1.7.0
+    rev: v2.5.1
     hooks:
       - id: pyproject-fmt
   - repo: https://github.com/abravalheri/validate-pyproject
-    rev: v0.16
+    rev: v0.24.1
     hooks:
       - id: validate-pyproject

   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.4.1
+    rev: v0.11.8
     hooks:
       # Run the linter.
       - id: ruff
@@ -33,7 +33,7 @@ repos:
         types_or: [python, pyi, jupyter]

   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.9.0
+    rev: v1.15.0
     hooks:
       - id: mypy
         additional_dependencies:
@@ -20,6 +20,12 @@ ignore = [
     "B010",
     "B904",
     "B018",
+    # Temporarily ignore ASYNC rules until they can be properly fixed in a separate PR
+    "ASYNC110",
+    "ASYNC220",
+    "ASYNC221",
+    "ASYNC230",
+    "ASYNC251",
 ]

 [lint.flake8-quotes]
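The ASYNC codes added to the ignore list come from Ruff's flake8-async-derived rules, more of which are reported by the bumped Ruff version. As a rough illustration of what is being temporarily tolerated, ASYNC110 flags busy-wait loops like the sketch below (hypothetical snippet, not from the repository):

import asyncio

async def wait_for_flag(state: dict) -> None:
    # Polling a flag with sleep in a loop is what ASYNC110 reports; the usual
    # fix is an asyncio.Event. Shown only to illustrate the kind of code the
    # temporary ignore list permits.
    while not state.get('ready'):
        await asyncio.sleep(0.1)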
@@ -73,7 +73,7 @@ class Q20Game:
         usr_msg = self.answerer(guesser_question)

         self.guesser_messages.append(
-            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
+            {'role': 'user', 'content': f'{usr_msg["content"].strip()}'}
         )

         if 'bingo' in usr_msg['content'].lower():
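The quote flip in the hunk above repeats throughout this commit: the newer Ruff formatter appears to normalize f-strings to the project's single-quote preference and switches the quotes inside the replacement field to double quotes, leaving the resulting string unchanged. A small behaviour sketch (hypothetical usr_msg stand-in):

usr_msg = {'content': '  Is it an animal?  '}

# Style before the update: double-quoted f-string so the subscript can use single quotes.
old_style = f"{usr_msg['content'].strip()}"

# Style after the update: single-quoted f-string with double quotes inside the braces.
new_style = f'{usr_msg["content"].strip()}'

assert old_style == new_style == 'Is it an animal?'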
@@ -67,7 +67,7 @@ def initialize_runtime(

     This function is called before the runtime is used to run the agent.
     """
-    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
     obs: CmdOutputObservation

     # Set instance id
@@ -100,7 +100,7 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0

-    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')


 def complete_runtime(
@@ -113,7 +113,7 @@ def complete_runtime(
     If you need to do something in the sandbox to get the correctness metric after
     the agent has run, modify this function.
     """
-    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
     obs: CmdOutputObservation

     agent_answer = None
@@ -165,7 +165,7 @@ def complete_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     final_ans = obs.content

-    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
     return {
         'final_ans': final_ans,
         'agent_answer': agent_answer,
@@ -80,7 +80,7 @@ def initialize_runtime(

     This function is called before the runtime is used to run the agent.
     """
-    logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n")
+    logger.info(f'\n{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}\n')
     obs: CmdOutputObservation

     # Set instance id
@@ -110,7 +110,7 @@ def initialize_runtime(
         file_path,
         '/workspace',
     )
-    logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n")
+    logger.info(f'\n{"-" * 50} END Runtime Initialization Fn {"-" * 50}\n')


 def complete_runtime(
@@ -123,7 +123,7 @@ def complete_runtime(
     If you need to do something in the sandbox to get the correctness metric after
     the agent has run, modify this function.
     """
-    logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n")
+    logger.info(f'\n{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}\n')
     obs: CmdOutputObservation

     # Rewriting the test file to ignore any changes Agent may have made.
@@ -147,7 +147,7 @@ def complete_runtime(
     if isinstance(obs, CmdOutputObservation):
         exit_code = obs.exit_code

-    logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n")
+    logger.info(f'\n{"-" * 50} END Runtime Completion Fn {"-" * 50}\n')

     runtime.close()
@@ -84,7 +84,7 @@ def initialize_runtime(

     This function is called before the runtime is used to run the agent.
     """
-    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
     obs: CmdOutputObservation

     file_ext = FILE_EXT_MAP[instance.language.lower()]
@@ -128,7 +128,7 @@ def initialize_runtime(
     assert obs.exit_code == 0

     # download repository archive
-    repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
+    repository_url = f'https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split("/")[1]}.zip'
     action = CmdRunAction(command='wget -O repo.zip ' + repository_url)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
@@ -160,7 +160,7 @@ def initialize_runtime(
     obs = runtime.run_action(action)
     assert obs.exit_code == 0, f'Failed to remove the code: {obs.content}'

-    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')


 def complete_runtime(
@@ -173,7 +173,7 @@ def complete_runtime(
     If you need to do something in the sandbox to get the correctness metric after
     the agent has run, modify this function.
     """
-    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
     obs: CmdOutputObservation

     test_result = {'result': {}, 'metadata': {}}
@@ -233,7 +233,7 @@ def complete_runtime(
     test_result['metadata']['2_run_test_success'] = False
     test_result['metadata']['2_run_test_result'] = str(obs.content)

-    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
     return test_result
@@ -44,7 +44,7 @@ def remove_code(target_filepath: str, line_start: int, line_end: int, language:
     lines = (
         lines[:line_start]
         + [
-            f"{' '*comment_indent_size+comment_prefix[language.lower()]}TODO: replace with your code here"
+            f'{" " * comment_indent_size + comment_prefix[language.lower()]}TODO: replace with your code here'
         ]
         + ([''] * 2)
         + lines[line_end:]
@@ -184,7 +184,7 @@ def load_bird():
             .fetchall()
         )
         prompt += (
-            f"/*\n3 example rows:\n{top_k_row_query}\n{' '.join(headers)}\n"
+            f'/*\n3 example rows:\n{top_k_row_query}\n{" ".join(headers)}\n'
         )
         for row in top_k_rows:
             row = [str(x) for x in row]
@@ -201,10 +201,10 @@ def load_bird():

         # Extract the CREATE TABLE statements and sample data from the database
         prompt = _extract_create_table_prompt(db_path)
-        prompt += f"-- External Knowledge: {e['evidence']}\n\n"
+        prompt += f'-- External Knowledge: {e["evidence"]}\n\n'
         prompt += '-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n'
         prompt += '-- Using valid SQLite, answer the following questions for the tables provided above.\n'
-        prompt += f"Question: {e['question']}\n"
+        prompt += f'Question: {e["question"]}\n'

         return prompt
@@ -224,7 +224,7 @@ def load_bird():
         item = {
             'instance_id': f'{len(processed_data)}',
             'db_path': os.path.join(
-                database_path, e['db_id'], f"{e['db_id']}.sqlite"
+                database_path, e['db_id'], f'{e["db_id"]}.sqlite'
             ),
             'db_id': e['db_id'],
             'instruction': _create_prompt(e, database_path),
@@ -253,7 +253,7 @@ def initialize_runtime(

     This function is called before the runtime is used to run the agent.
     """
-    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
     obs: CmdOutputObservation

     # Copy the database to the workspace
@@ -273,7 +273,7 @@ def initialize_runtime(
     assert obs.exit_code == 0
     assert f'{instance.db_id}.sqlite' in obs.content

-    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')


 def complete_runtime(
@@ -286,7 +286,7 @@ def complete_runtime(
     If you need to do something in the sandbox to get the correctness metric after
     the agent has run, modify this function.
     """
-    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
     obs: CmdOutputObservation
     timeout = 30

@@ -343,7 +343,7 @@ def complete_runtime(
         'gen_sql': gen_sql,
         'gold_sql': gold_sql,
     }
-    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+    logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
     return test_result
@@ -34,9 +34,9 @@ SUPPORTED_AGENT_CLS = {'CodeActAgent'}
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
-    assert (
-        metadata.max_iterations == 1
-    ), 'max_iterations must be 1 for browsing delegation evaluation.'
+    assert metadata.max_iterations == 1, (
+        'max_iterations must be 1 for browsing delegation evaluation.'
+    )
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
     config = AppConfig(
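The rewritten assert above shows another formatter change that recurs in this diff: instead of wrapping the asserted condition in parentheses to make room for the message, the newer Ruff formatter keeps the condition on the assert line and parenthesizes the long message. A minimal sketch (hypothetical condition and message):

max_iterations = 1

# Older output:
# assert (
#     max_iterations == 1
# ), 'max_iterations must be 1 for browsing delegation evaluation.'

# Newer output:
assert max_iterations == 1, (
    'max_iterations must be 1 for browsing delegation evaluation.'
)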
@@ -82,9 +82,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):

     if RUN_WITH_BROWSING:
         instruction += (
-            '<IMPORTANT!>\n'
-            'You SHOULD NEVER attempt to browse the web. '
-            '</IMPORTANT!>\n'
+            '<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
         )
     return instruction
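Collapsing the three adjacent string literals into one line is also a formatting-only change: Python concatenates adjacent literals at compile time, and the newer formatter appears to join such implicit concatenations when the result fits on a single line. Sketch (hypothetical marker text):

note_split = (
    '<NOTE>\n'
    'Keep answers short. '
    '</NOTE>\n'
)
note_joined = '<NOTE>\nKeep answers short. </NOTE>\n'

# Both spellings produce the same string value.
assert note_split == note_joined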
@@ -265,7 +263,7 @@ def complete_runtime(
|
||||
|
||||
test_dir = instance['test']['test_dir']
|
||||
action = CmdRunAction(
|
||||
command=f"{instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"
|
||||
command=f'{instance["test"]["test_cmd"]} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1'
|
||||
)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
|
||||
@@ -489,7 +489,7 @@ def run_eval_gold_vs_gen_NL_hypo_workflow(
|
||||
gen_subh_to_gold_subh[p_id] = g_id
|
||||
gold_subh_covered.append(g_id)
|
||||
gen_gold_subh_to_context[f'P{p_id}||G{g_id}'] = {
|
||||
'question': f"""Comapring: GoldH: {gold_subh["text"]}, GoldC: {gold_subh['context']}\nGenH: {gen_subh['text']}, GenC: {gen_subh['context']}""",
|
||||
'question': f"""Comapring: GoldH: {gold_subh['text']}, GoldC: {gold_subh['context']}\nGenH: {gen_subh['text']}, GenC: {gen_subh['context']}""",
|
||||
'answer': context_bool,
|
||||
'score': context_score,
|
||||
}
|
||||
|
||||
@@ -145,7 +145,7 @@ def initialize_runtime(runtime: Runtime, data_files: list[str]):
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
action = CmdRunAction(command='mkdir -p /workspace')
|
||||
@@ -170,7 +170,7 @@ def initialize_runtime(runtime: Runtime, data_files: list[str]):
|
||||
obs = runtime.run_action(action)
|
||||
assert obs.exit_code == 0
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
def get_last_agent_finish_action(state: State) -> AgentFinishAction:
|
||||
|
||||
@@ -78,7 +78,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
action = CmdRunAction(command='mkdir -p /workspace')
|
||||
@@ -110,7 +110,7 @@ def initialize_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
assert obs.exit_code == 0
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
def process_instance(
|
||||
@@ -134,10 +134,10 @@ def process_instance(
|
||||
dest_file = None
|
||||
|
||||
# Prepare instruction
|
||||
instruction = f"{instance['Question']}\n"
|
||||
instruction = f'{instance["Question"]}\n'
|
||||
logger.info(f'Instruction: {instruction}')
|
||||
if dest_file:
|
||||
instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
|
||||
instruction += f'\n\nThe mentioned file is provided in the workspace at: {dest_file.split("/")[-1]}'
|
||||
|
||||
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
|
||||
|
||||
@@ -21,7 +21,7 @@ def split_string(
|
||||
) -> list[str]:
|
||||
if char_list is None:
|
||||
char_list = [',', ';']
|
||||
pattern = f"[{''.join(char_list)}]"
|
||||
pattern = f'[{"".join(char_list)}]'
|
||||
return re.split(pattern, s)
|
||||
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
action = CmdRunAction(command='mkdir -p /workspace')
|
||||
@@ -143,7 +143,7 @@ def initialize_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
assert obs.exit_code == 0
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
def complete_runtime(
|
||||
@@ -156,7 +156,7 @@ def complete_runtime(
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# default value
|
||||
@@ -190,7 +190,7 @@ def complete_runtime(
|
||||
'timeout': timeout,
|
||||
'num_workers': num_workers,
|
||||
}
|
||||
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return test_result
|
||||
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ def run_eval(
|
||||
runtime: Runtime,
|
||||
):
|
||||
"""Run the evaluation and create report"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
lca_path = bench_config['LCA_PATH']
|
||||
@@ -146,7 +146,7 @@ def run_eval(
|
||||
obs = runtime.run_action(action)
|
||||
report_str = obs.content
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
return report_str
|
||||
|
||||
|
||||
|
||||
@@ -95,7 +95,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
lca_path = bench_config['LCA_PATH']
|
||||
@@ -177,7 +177,7 @@ def initialize_runtime(
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
def complete_runtime(
|
||||
@@ -190,7 +190,7 @@ def complete_runtime(
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
model_name = bench_config['model_name']
|
||||
@@ -227,7 +227,7 @@ def complete_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
result = json.loads(obs.content)
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
|
||||
return result
|
||||
|
||||
@@ -313,7 +313,7 @@ Phase 7. VERIFICATION: Test your implementation thoroughly.
|
||||
7.2.3 The functions you changed
|
||||
7.4 If any tests fail, revise your implementation until all tests pass
|
||||
|
||||
Phase 8. REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance["sha_fail"]}.
|
||||
Phase 8. REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance['sha_fail']}.
|
||||
8.1 Ensure you've fully addressed all requirements.
|
||||
|
||||
Once all phases are done, announce: 'Agent Task Complete'.
|
||||
|
||||
@@ -141,7 +141,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id
|
||||
@@ -174,7 +174,7 @@ def initialize_runtime(
|
||||
ipynb_obs = runtime.run_action(action)
|
||||
logger.info(ipynb_obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
# Prepare instruction
|
||||
|
||||
@@ -82,7 +82,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id
|
||||
@@ -103,7 +103,7 @@ def initialize_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
return goal, obs
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ def complete_runtime(
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
|
||||
@@ -124,7 +124,7 @@ def complete_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return {
|
||||
'rewards': json.loads(obs.content),
|
||||
}
|
||||
|
||||
@@ -130,7 +130,7 @@ def initialize_runtime(runtime: Runtime):
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id
|
||||
@@ -144,7 +144,7 @@ def initialize_runtime(runtime: Runtime):
|
||||
obs = runtime.run_action(action)
|
||||
assert obs.exit_code == 0
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
def process_instance(
|
||||
|
||||
@@ -93,7 +93,7 @@ def classify_error(llm: LLM, failed_case: dict) -> str:
|
||||
error_category = response.choices[0].message['content']
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to classify the error for the failed case: {failed_case['instance_id']}"
|
||||
f'Failed to classify the error for the failed case: {failed_case["instance_id"]}'
|
||||
)
|
||||
logger.error(e)
|
||||
error_category = input(
|
||||
|
||||
@@ -103,7 +103,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id
|
||||
@@ -137,7 +137,7 @@ def initialize_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
assert obs.exit_code == 0
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
def complete_runtime(
|
||||
@@ -150,7 +150,7 @@ def complete_runtime(
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
repo_url = instance['github']
|
||||
@@ -199,7 +199,7 @@ def complete_runtime(
|
||||
outputs['success'] = 1
|
||||
outputs['eval_exit_code'] = obs.exit_code
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return outputs
|
||||
|
||||
|
||||
|
||||
@@ -120,9 +120,9 @@ def process_instance(
|
||||
"""
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
assert (
|
||||
log_dir is not None
|
||||
), "Can't reset logger without a provided log directory."
|
||||
assert log_dir is not None, (
|
||||
"Can't reset logger without a provided log directory."
|
||||
)
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
|
||||
else:
|
||||
@@ -289,7 +289,7 @@ def process_instance(
|
||||
)
|
||||
report = _report[instance_id]
|
||||
logger.info(
|
||||
f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
|
||||
f'[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report["resolved"]}'
|
||||
)
|
||||
instance['test_result']['report']['resolved'] = report[
|
||||
'resolved'
|
||||
@@ -365,9 +365,9 @@ if __name__ == '__main__':
|
||||
for line in tqdm(f, desc='Loading predictions')
|
||||
]
|
||||
)
|
||||
assert (
|
||||
'instance_id' in predictions.columns
|
||||
), 'Input file must contain instance_id column.'
|
||||
assert 'instance_id' in predictions.columns, (
|
||||
'Input file must contain instance_id column.'
|
||||
)
|
||||
|
||||
if 'model_patch' not in predictions.columns and (
|
||||
'test_result' in predictions.columns
|
||||
@@ -376,17 +376,17 @@ if __name__ == '__main__':
|
||||
raise ValueError(
|
||||
'Input file must contain model_patch column OR test_result column with model_patch field.'
|
||||
)
|
||||
assert len(predictions['instance_id'].unique()) == len(
|
||||
predictions
|
||||
), 'instance_id column must be unique.'
|
||||
assert len(predictions['instance_id'].unique()) == len(predictions), (
|
||||
'instance_id column must be unique.'
|
||||
)
|
||||
|
||||
if 'model_patch' not in predictions.columns:
|
||||
predictions['model_patch'] = predictions['test_result'].apply(
|
||||
lambda x: x.get('git_patch', '')
|
||||
)
|
||||
assert {'instance_id', 'model_patch'}.issubset(
|
||||
set(predictions.columns)
|
||||
), 'Input file must contain instance_id and model_patch columns.'
|
||||
assert {'instance_id', 'model_patch'}.issubset(set(predictions.columns)), (
|
||||
'Input file must contain instance_id and model_patch columns.'
|
||||
)
|
||||
|
||||
# Process model_patch
|
||||
predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
|
||||
|
||||
@@ -103,21 +103,21 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
|
||||
f'<issue_description>\n'
|
||||
f'{instance.problem_statement}\n'
|
||||
'</issue_description>\n\n'
|
||||
"Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n"
|
||||
'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
|
||||
"I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
|
||||
"Also the development Java environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
|
||||
"Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n"
|
||||
"Follow these steps to resolve the issue:\n"
|
||||
"1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n"
|
||||
'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
|
||||
'Follow these steps to resolve the issue:\n'
|
||||
'1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
|
||||
'2. Create a Java class to reproduce the error and execute it by first compiling with `javac <classname>.java` and then running with `java <classname>` using the BashTool, to confirm the error\n'
|
||||
"3. Edit the sourcecode of the repo to resolve the issue.\n"
|
||||
"4. Rerun your reproduce script or class and confirm that the error is fixed!\n"
|
||||
"5. Think about edgecases, add comprehensive tests for them in your reproduce class or script, and run them to make sure your fix handles these cases as well.\n"
|
||||
f"6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance['base_commit']}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n"
|
||||
" - The issue you are fixing\n"
|
||||
" - The files you modified\n"
|
||||
" - The functions or classes you changed\n"
|
||||
" Make sure all these tests pass with your changes.\n"
|
||||
'3. Edit the sourcecode of the repo to resolve the issue.\n'
|
||||
'4. Rerun your reproduce script or class and confirm that the error is fixed!\n'
|
||||
'5. Think about edgecases, add comprehensive tests for them in your reproduce class or script, and run them to make sure your fix handles these cases as well.\n'
|
||||
f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
|
||||
' - The issue you are fixing\n'
|
||||
' - The files you modified\n'
|
||||
' - The functions or classes you changed\n'
|
||||
' Make sure all these tests pass with your changes.\n'
|
||||
"Your thinking should be thorough and so it's fine if it's very long.\n"
|
||||
),
|
||||
'go': (
|
||||
@@ -275,9 +275,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
|
||||
|
||||
if instruction and RUN_WITH_BROWSING:
|
||||
instruction += (
|
||||
'<IMPORTANT!>\n'
|
||||
'You SHOULD NEVER attempt to browse the web. '
|
||||
'</IMPORTANT!>\n'
|
||||
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
|
||||
)
|
||||
return instruction
|
||||
|
||||
|
||||
@@ -3,9 +3,10 @@ import json
 input_file = 'XXX.jsonl'
 output_file = 'YYY.jsonl'

-with open(input_file, 'r', encoding='utf-8') as fin, open(
-    output_file, 'w', encoding='utf-8'
-) as fout:
+with (
+    open(input_file, 'r', encoding='utf-8') as fin,
+    open(output_file, 'w', encoding='utf-8') as fout,
+):
     for line in fin:
         line = line.strip()
         if not line:
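The rewrite above uses the parenthesized multi-context with statement available since Python 3.10, which the updated formatter prefers over splitting a single long "with open(...) as a, open(...) as b:" header across lines. A self-contained sketch (placeholder file names, not the script's real inputs):

with open('in.txt', 'w', encoding='utf-8') as seed:
    seed.write('example line\n')

with (
    open('in.txt', 'r', encoding='utf-8') as fin,
    open('out.txt', 'w', encoding='utf-8') as fout,
):
    for line in fin:
        fout.write(line.upper())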
@@ -92,7 +92,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set up workspace directories
|
||||
@@ -123,7 +123,7 @@ def initialize_runtime(
|
||||
assert obs.exit_code == 0
|
||||
assert dataset_name in obs.content
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
def complete_runtime(
|
||||
@@ -136,7 +136,7 @@ def complete_runtime(
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
test_result = {}
|
||||
@@ -156,7 +156,7 @@ def complete_runtime(
|
||||
else:
|
||||
test_result = {'program': 'ERROR'}
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return test_result
|
||||
|
||||
|
||||
|
||||
@@ -129,15 +129,15 @@ def process_instance(
|
||||
|
||||
AssertionError: if `conditional_imports` is not provided.
|
||||
"""
|
||||
assert (
|
||||
conditional_imports is not None
|
||||
), 'conditional_imports must be provided to run process_instance using multiprocessing'
|
||||
assert conditional_imports is not None, (
|
||||
'conditional_imports must be provided to run process_instance using multiprocessing'
|
||||
)
|
||||
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
assert (
|
||||
log_dir is not None
|
||||
), "Can't reset logger without a provided log directory."
|
||||
assert log_dir is not None, (
|
||||
"Can't reset logger without a provided log directory."
|
||||
)
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
|
||||
else:
|
||||
@@ -319,7 +319,7 @@ def process_instance(
|
||||
)
|
||||
report = _report[instance_id]
|
||||
logger.info(
|
||||
f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
|
||||
f'[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report["resolved"]}'
|
||||
)
|
||||
instance['test_result']['report']['resolved'] = report[
|
||||
'resolved'
|
||||
@@ -418,9 +418,9 @@ if __name__ == '__main__':
|
||||
for line in tqdm(f, desc='Loading predictions')
|
||||
]
|
||||
)
|
||||
assert (
|
||||
'instance_id' in predictions.columns
|
||||
), 'Input file must contain instance_id column.'
|
||||
assert 'instance_id' in predictions.columns, (
|
||||
'Input file must contain instance_id column.'
|
||||
)
|
||||
|
||||
if 'model_patch' not in predictions.columns and (
|
||||
'test_result' in predictions.columns
|
||||
@@ -429,17 +429,17 @@ if __name__ == '__main__':
|
||||
raise ValueError(
|
||||
'Input file must contain model_patch column OR test_result column with model_patch field.'
|
||||
)
|
||||
assert len(predictions['instance_id'].unique()) == len(
|
||||
predictions
|
||||
), 'instance_id column must be unique.'
|
||||
assert len(predictions['instance_id'].unique()) == len(predictions), (
|
||||
'instance_id column must be unique.'
|
||||
)
|
||||
|
||||
if 'model_patch' not in predictions.columns:
|
||||
predictions['model_patch'] = predictions['test_result'].apply(
|
||||
lambda x: x.get('git_patch', '')
|
||||
)
|
||||
assert {'instance_id', 'model_patch'}.issubset(
|
||||
set(predictions.columns)
|
||||
), 'Input file must contain instance_id and model_patch columns.'
|
||||
assert {'instance_id', 'model_patch'}.issubset(set(predictions.columns)), (
|
||||
'Input file must contain instance_id and model_patch columns.'
|
||||
)
|
||||
|
||||
# Process model_patch
|
||||
predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
|
||||
|
||||
@@ -160,7 +160,7 @@ Phase 7. VERIFICATION: Test your implementation thoroughly.
|
||||
7.2 Add edge cases to your test script to ensure comprehensive coverage.
|
||||
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.
|
||||
|
||||
8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance["base_commit"]}.
|
||||
8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance['base_commit']}.
|
||||
8.1 Ensure you've fully addressed all requirements.
|
||||
8.2 Run any tests in the repository related to:
|
||||
8.2.1 The issue you are fixing
|
||||
@@ -173,16 +173,14 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
|
||||
|
||||
if RUN_WITH_BROWSING:
|
||||
instruction += (
|
||||
'<IMPORTANT!>\n'
|
||||
'You SHOULD NEVER attempt to browse the web. '
|
||||
'</IMPORTANT!>\n'
|
||||
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
|
||||
)
|
||||
|
||||
if 'image_assets' in instance:
|
||||
assets = json.loads(instance['image_assets'])
|
||||
assert (
|
||||
'problem_statement' in assets
|
||||
), 'problem_statement is required in image_assets'
|
||||
assert 'problem_statement' in assets, (
|
||||
'problem_statement is required in image_assets'
|
||||
)
|
||||
image_urls = assets['problem_statement']
|
||||
return MessageAction(content=instruction, image_urls=image_urls)
|
||||
return MessageAction(content=instruction)
|
||||
|
||||
@@ -137,7 +137,7 @@ for repo, diff in repo_diffs:
|
||||
is_significant = diff >= threshold
|
||||
repo_color = 'red' if is_significant else 'yellow'
|
||||
|
||||
print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
|
||||
print(f'\n{colored(repo, repo_color, attrs=["bold"])}:')
|
||||
print(
|
||||
colored(
|
||||
f'Difference: {diff} instances! (Larger diff = Y better)',
|
||||
|
||||
@@ -105,12 +105,12 @@ def convert_tool_call_to_string(tool_call: dict) -> str:
|
||||
if tool_call['type'] != 'function':
|
||||
raise ValueError("Tool call type must be 'function'.")
|
||||
|
||||
ret = f"<function={tool_call['function']['name']}>\n"
|
||||
ret = f'<function={tool_call["function"]["name"]}>\n'
|
||||
try:
|
||||
args = json.loads(tool_call['function']['arguments'])
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(
|
||||
f"Failed to parse arguments as JSON. Arguments: {tool_call['function']['arguments']}"
|
||||
f'Failed to parse arguments as JSON. Arguments: {tool_call["function"]["arguments"]}'
|
||||
) from e
|
||||
for param_name, param_value in args.items():
|
||||
is_multiline = isinstance(param_value, str) and '\n' in param_value
|
||||
|
||||
@@ -263,38 +263,38 @@ if __name__ == '__main__':
|
||||
# Print detailed results for single file
|
||||
print(f'\nResults for {args.input_path}:')
|
||||
print(
|
||||
f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
|
||||
f'Number of resolved: {result["resolved"]["count"]} / {result["total_instances"]} ({result["resolved"]["percentage"]:.2f}% [{result["resolved"]["ci"][0]:.2f}%, {result["resolved"]["ci"][1]:.2f}%])'
|
||||
)
|
||||
print(
|
||||
f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
|
||||
f'Number of empty patch: {result["empty_patches"]["count"]} / {result["total_instances"]} ({result["empty_patches"]["percentage"]:.2f}%)'
|
||||
)
|
||||
print(
|
||||
f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
|
||||
f'Number of error lines: {result["errors"]["total"]} / {result["total_instances"]} ({result["errors"]["percentage"]:.2f}%)'
|
||||
)
|
||||
print(
|
||||
f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
|
||||
f'Number of agent stuck in loop: {result["errors"]["stuck_in_loop"]["count"]} / {result["total_instances"]} ({result["errors"]["stuck_in_loop"]["percentage"]:.2f}%)'
|
||||
)
|
||||
print(
|
||||
f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
|
||||
f'Number of unfinished runs: {result["unfinished_runs"]["count"]} / {result["total_instances"]} ({result["unfinished_runs"]["percentage"]:.2f}%)'
|
||||
)
|
||||
print(f"Total cost: {result['costs']['total']:.2f} USD")
|
||||
print(f'Total cost: {result["costs"]["total"]:.2f} USD')
|
||||
print('## Statistics')
|
||||
print(
|
||||
f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
|
||||
f'Avg. num of turns per instance: {result["statistics"]["avg_turns"]:.2f}'
|
||||
)
|
||||
print(
|
||||
f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
|
||||
f'Avg. agent cost per instance: {result["statistics"]["costs"]["main_agent"]:.2f} USD'
|
||||
)
|
||||
print(
|
||||
f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
|
||||
f'Avg. editor cost per instance: {result["statistics"]["costs"]["editor"]:.2f} USD'
|
||||
)
|
||||
print(
|
||||
f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
|
||||
f'Avg. total cost per instance: {result["statistics"]["costs"]["total"]:.2f} USD'
|
||||
)
|
||||
|
||||
print('## Detailed error breakdown:')
|
||||
for error, data in result['errors']['breakdown'].items():
|
||||
print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
|
||||
print(f'{error}: {data["count"]} ({data["percentage"]:.2f}%)')
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error processing {args.input_path}: {str(e)}')
|
||||
|
||||
@@ -34,16 +34,16 @@ if os.path.exists(swebench_official_report_json):
|
||||
report = json.load(f)
|
||||
|
||||
output_md = (
|
||||
"# SWE-bench Report\n"
|
||||
"This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n"
|
||||
"## Summary\n"
|
||||
f"- total instances: {report['total_instances']}\n"
|
||||
f"- submitted instances: {report['submitted_instances']}\n"
|
||||
f"- completed instances: {report['completed_instances']}\n"
|
||||
f"- empty patch instances: {report['empty_patch_instances']}\n"
|
||||
f"- resolved instances: {report['resolved_instances']}\n"
|
||||
f"- unresolved instances: {report['unresolved_instances']}\n"
|
||||
f"- error instances: {report['error_instances']}\n"
|
||||
'# SWE-bench Report\n'
|
||||
'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
|
||||
'## Summary\n'
|
||||
f'- total instances: {report["total_instances"]}\n'
|
||||
f'- submitted instances: {report["submitted_instances"]}\n'
|
||||
f'- completed instances: {report["completed_instances"]}\n'
|
||||
f'- empty patch instances: {report["empty_patch_instances"]}\n'
|
||||
f'- resolved instances: {report["resolved_instances"]}\n'
|
||||
f'- unresolved instances: {report["unresolved_instances"]}\n'
|
||||
f'- error instances: {report["error_instances"]}\n'
|
||||
)
|
||||
|
||||
output_md += '\n## Resolved Instances\n'
|
||||
@@ -111,12 +111,12 @@ elif os.path.exists(openhands_remote_report_jsonl):
|
||||
print(f'Total instances in eval report: {n_eval_instances}')
|
||||
|
||||
# Verify no duplicates
|
||||
assert (
|
||||
len(instance_ids) == n_instances
|
||||
), 'Duplicate instance ids found in original output'
|
||||
assert (
|
||||
len(eval_instance_ids) == n_eval_instances
|
||||
), 'Duplicate instance ids found in eval report'
|
||||
assert len(instance_ids) == n_instances, (
|
||||
'Duplicate instance ids found in original output'
|
||||
)
|
||||
assert len(eval_instance_ids) == n_eval_instances, (
|
||||
'Duplicate instance ids found in eval report'
|
||||
)
|
||||
|
||||
# Initialize counters
|
||||
stats = {'total': len(instance_ids), 'resolved': 0, 'empty_patch': 0, 'error': 0}
|
||||
@@ -152,7 +152,7 @@ elif os.path.exists(openhands_remote_report_jsonl):
|
||||
|
||||
# Generate markdown report
|
||||
def _instance_id_to_log_path(instance_id):
|
||||
path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
|
||||
path = f'{args.input_file.replace(".jsonl", ".swebench_eval.logs")}/instance_{instance_id}.log'
|
||||
return os.path.relpath(path, start=dirname)
|
||||
|
||||
# ... rest of markdown generation code remains the same ...
|
||||
@@ -228,9 +228,10 @@ if os.path.exists(args.input_file + '.bak'):
|
||||
os.rename(args.input_file, args.input_file + '.bak')
|
||||
|
||||
# Process and write file row by row
|
||||
with open(args.input_file + '.bak', 'r') as infile, open(
|
||||
args.input_file, 'w'
|
||||
) as outfile:
|
||||
with (
|
||||
open(args.input_file + '.bak', 'r') as infile,
|
||||
open(args.input_file, 'w') as outfile,
|
||||
):
|
||||
for line in tqdm(infile, desc='Updating output file'):
|
||||
data = json.loads(line)
|
||||
instance_id = data['instance_id']
|
||||
|
||||
@@ -20,7 +20,7 @@ def verify_instance_costs(row: pd.Series) -> float:
|
||||
try:
|
||||
metrics = row.get('metrics')
|
||||
if not metrics:
|
||||
logger.warning(f"Instance {row['instance_id']}: No metrics found")
|
||||
logger.warning(f'Instance {row["instance_id"]}: No metrics found')
|
||||
return 0.0
|
||||
|
||||
accumulated = metrics.get('accumulated_cost')
|
||||
@@ -28,7 +28,7 @@ def verify_instance_costs(row: pd.Series) -> float:
|
||||
|
||||
if accumulated is None:
|
||||
logger.warning(
|
||||
f"Instance {row['instance_id']}: No accumulated_cost in metrics"
|
||||
f'Instance {row["instance_id"]}: No accumulated_cost in metrics'
|
||||
)
|
||||
return 0.0
|
||||
|
||||
@@ -41,8 +41,8 @@ def verify_instance_costs(row: pd.Series) -> float:
|
||||
if abs(costs[i]['cost'] - costs[i + 1]['cost']) < 1e-6:
|
||||
has_duplicate = True
|
||||
logger.debug(
|
||||
f"Instance {row['instance_id']}: Possible buggy double-counting detected! "
|
||||
f"Steps {i} and {i+1} have identical costs: {costs[i]['cost']:.2f}"
|
||||
f'Instance {row["instance_id"]}: Possible buggy double-counting detected! '
|
||||
f'Steps {i} and {i + 1} have identical costs: {costs[i]["cost"]:.2f}'
|
||||
)
|
||||
else:
|
||||
all_pairs_match = False
|
||||
@@ -64,15 +64,15 @@ def verify_instance_costs(row: pd.Series) -> float:
|
||||
|
||||
if not abs(total_cost - accumulated) < 1e-6:
|
||||
logger.warning(
|
||||
f"Instance {row['instance_id']}: Cost mismatch: "
|
||||
f"accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, "
|
||||
f'Instance {row["instance_id"]}: Cost mismatch: '
|
||||
f'accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, '
|
||||
)
|
||||
|
||||
return total_cost
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error verifying costs for instance {row.get('instance_id', 'UNKNOWN')}: {e}"
|
||||
f'Error verifying costs for instance {row.get("instance_id", "UNKNOWN")}: {e}'
|
||||
)
|
||||
return 0.0
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@
|
||||
"for FILE_PATH in FILE_PATHS:\n",
|
||||
" with gzip.open(FILE_PATH, 'rb') as f: # Use 'rb' for gzipped files\n",
|
||||
" for i, line in tqdm(\n",
|
||||
" enumerate(f), desc=f\"Processing {FILE_PATH.split('/')[-1]}\"\n",
|
||||
" enumerate(f), desc=f'Processing {FILE_PATH.split(\"/\")[-1]}'\n",
|
||||
" ):\n",
|
||||
" # Parse only the fields we need\n",
|
||||
" raw_data = json.loads(line)\n",
|
||||
|
||||
@@ -54,9 +54,9 @@ logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
|
||||
|
||||
def get_config(instance: pd.Series) -> AppConfig:
|
||||
base_container_image = get_instance_docker_image(instance['instance_id_swebench'])
|
||||
assert (
|
||||
base_container_image
|
||||
), f"Invalid container image for instance {instance['instance_id_swebench']}."
|
||||
assert base_container_image, (
|
||||
f'Invalid container image for instance {instance["instance_id_swebench"]}.'
|
||||
)
|
||||
logger.info(f'Using instance container image: {base_container_image}.')
|
||||
return AppConfig(
|
||||
run_as_openhands=False,
|
||||
@@ -183,9 +183,9 @@ def run_mutation_testing(
|
||||
mutation_action = CmdRunAction(command=f'cat {log_file}')
|
||||
mutation_action.set_hard_timeout(300)
|
||||
mutation_obs = runtime.run_action(mutation_action)
|
||||
assert isinstance(
|
||||
mutation_obs, CmdOutputObservation
|
||||
), 'Failed to retrieve mutation output.'
|
||||
assert isinstance(mutation_obs, CmdOutputObservation), (
|
||||
'Failed to retrieve mutation output.'
|
||||
)
|
||||
return mutation_obs.exit_code, mutation_obs.content
|
||||
|
||||
|
||||
@@ -294,9 +294,9 @@ def process_instance(
|
||||
AssertionError: if the `reset_logger` flag is set without a provided log directory.
|
||||
"""
|
||||
if reset_logger:
|
||||
assert (
|
||||
log_dir is not None
|
||||
), "Can't reset logger without a provided log directory."
|
||||
assert log_dir is not None, (
|
||||
"Can't reset logger without a provided log directory."
|
||||
)
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
|
||||
else:
|
||||
@@ -528,9 +528,9 @@ if __name__ == '__main__':
|
||||
# Load predictions
|
||||
assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
|
||||
predictions = pd.read_json(args.input_file, lines=True)
|
||||
assert (
|
||||
'instance_id' in predictions.columns
|
||||
), 'Input file must contain instance_id column.'
|
||||
assert 'instance_id' in predictions.columns, (
|
||||
'Input file must contain instance_id column.'
|
||||
)
|
||||
|
||||
if 'test_suite' not in predictions.columns and (
|
||||
'test_result' in predictions.columns
|
||||
@@ -562,9 +562,9 @@ if __name__ == '__main__':
|
||||
lambda x: x['test_suite']
|
||||
)
|
||||
|
||||
assert len(predictions['instance_id'].unique()) == len(
|
||||
predictions
|
||||
), 'instance_id column must be unique.'
|
||||
assert len(predictions['instance_id'].unique()) == len(predictions), (
|
||||
'instance_id column must be unique.'
|
||||
)
|
||||
|
||||
assert {'instance_id_swebench', 'test_suite', 'instance_id'}.issubset(
|
||||
set(predictions.columns)
|
||||
|
||||
@@ -1,5 +1,5 @@
 import sys
-from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union
+from typing import Callable, Optional, Sequence, TypeVar, Union

 import nltk
 import numpy as np
@@ -11,7 +11,7 @@ if sys.getrecursionlimit() < 10_000:
|
||||
sys.setrecursionlimit(10_000)
|
||||
|
||||
|
||||
def bleu(gold: List[str], pred: List[str]) -> float:
|
||||
def bleu(gold: list[str], pred: list[str]) -> float:
|
||||
"""
|
||||
Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
|
||||
|
||||
@@ -29,7 +29,7 @@ def bleu(gold: List[str], pred: List[str]) -> float:
|
||||
)
|
||||
|
||||
|
||||
def batch_bleu(golds: List[List[str]], preds: List[List[str]]) -> List[float]:
|
||||
def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
|
||||
"""
|
||||
Calculate BLEU score for a batch of sentences.
|
||||
|
||||
@@ -42,7 +42,7 @@ def batch_bleu(golds: List[List[str]], preds: List[List[str]]) -> List[float]:
|
||||
return [bleu(gold, pred) for gold, pred in zip(golds, preds)]
|
||||
|
||||
|
||||
def corpus_bleu(golds: List[List[str]], preds: List[List[str]]) -> float:
|
||||
def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
|
||||
"""
|
||||
Calculate corpus-level BLEU score for a batch of sentences.
|
||||
|
||||
@@ -61,7 +61,7 @@ def corpus_bleu(golds: List[List[str]], preds: List[List[str]]) -> float:
|
||||
|
||||
|
||||
def edit_sim(
|
||||
gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = ' '
|
||||
gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
|
||||
) -> float:
|
||||
"""
|
||||
Calculate char-level edit similarity, in the range of 0~100.
|
||||
@@ -81,10 +81,10 @@ def edit_sim(
|
||||
|
||||
|
||||
def batch_edit_sim(
|
||||
golds: List[Union[str, List[str]]],
|
||||
preds: List[Union[str, List[str]]],
|
||||
golds: list[Union[str, list[str]]],
|
||||
preds: list[Union[str, list[str]]],
|
||||
sep: str = ' ',
|
||||
) -> List[float]:
|
||||
) -> list[float]:
|
||||
"""
|
||||
Calculate char-level edit similarity for a batch of sentences.
|
||||
|
||||
@@ -114,7 +114,7 @@ def exact_match(gold: T, pred: T) -> float:
|
||||
return 100.0 if gold == pred else 0.0
|
||||
|
||||
|
||||
def batch_exact_match(golds: List[T], preds: List[T]) -> List[float]:
|
||||
def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
|
||||
"""
|
||||
Calculate exact match accuracy for a batch of sentences.
|
||||
|
||||
@@ -128,8 +128,8 @@ def batch_exact_match(golds: List[T], preds: List[T]) -> List[float]:
|
||||
|
||||
|
||||
def rouge_l(
|
||||
gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = ' '
|
||||
) -> Dict[str, float]:
|
||||
gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
|
||||
) -> dict[str, float]:
|
||||
"""
|
||||
Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
|
||||
|
||||
@@ -152,10 +152,10 @@ def rouge_l(
|
||||
|
||||
|
||||
def batch_rouge_l(
|
||||
golds: List[Union[str, List[str]]],
|
||||
preds: List[Union[str, List[str]]],
|
||||
golds: list[Union[str, list[str]]],
|
||||
preds: list[Union[str, list[str]]],
|
||||
sep: str = ' ',
|
||||
) -> Dict[str, List[float]]:
|
||||
) -> dict[str, list[float]]:
|
||||
"""
|
||||
Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
|
||||
|
||||
@@ -171,8 +171,8 @@ def batch_rouge_l(
|
||||
|
||||
|
||||
def accuracy(
|
||||
gold: List[str],
|
||||
pred: List[str],
|
||||
gold: list[str],
|
||||
pred: list[str],
|
||||
ignore: Optional[Sequence[str]] = None,
|
||||
) -> float:
|
||||
"""
|
||||
@@ -206,10 +206,10 @@ def accuracy(
|
||||
|
||||
|
||||
def batch_accuracy(
|
||||
golds: List[List[str]],
|
||||
preds: List[List[str]],
|
||||
golds: list[list[str]],
|
||||
preds: list[list[str]],
|
||||
ignore: Optional[Sequence[str]] = None,
|
||||
) -> List[float]:
|
||||
) -> list[float]:
|
||||
"""
|
||||
Calculate token-level accuracy for a batch of sentences.
|
||||
|
||||
@@ -224,8 +224,8 @@ def batch_accuracy(
|
||||
|
||||
|
||||
def first_match_to_topk(
|
||||
first_match_list: List[int], k_values: List[int]
|
||||
) -> Dict[int, List[float]]:
|
||||
first_match_list: list[int], k_values: list[int]
|
||||
) -> dict[int, list[float]]:
|
||||
"""
|
||||
Calculate top-k accuracy with the first match ranks (1-indexed).
|
||||
|
||||
@@ -250,7 +250,7 @@ def pass_at_k(n: int, c: int, k: int) -> float:
|
||||
return (1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)).item()) * 100
|
||||
|
||||
|
||||
def self_bleu(samples: List[List[str]]) -> float:
|
||||
def self_bleu(samples: list[list[str]]) -> float:
|
||||
"""
|
||||
Calculate self-BLEU among the samples.
|
||||
:param samples: the chosen m samples
|
||||
@@ -273,7 +273,7 @@ def self_bleu(samples: List[List[str]]) -> float:
|
||||
return np.mean(scores).item()
|
||||
|
||||
|
||||
def self_edit_distance(samples: List[Union[str, List[str]]], sep=' ') -> float:
|
||||
def self_edit_distance(samples: list[Union[str, list[str]]], sep=' ') -> float:
|
||||
"""
|
||||
Calculate self-edit-distance among the samples.
|
||||
:param samples: the chosen m samples
|
||||
@@ -299,7 +299,7 @@ def self_edit_distance(samples: List[Union[str, List[str]]], sep=' ') -> float:
|
||||
return np.mean(scores).item()
|
||||
|
||||
|
||||
QUALITY_METRICS: Dict[str, Callable[[List[str], List[str]], float]] = {
|
||||
QUALITY_METRICS: dict[str, Callable[[list[str], list[str]], float]] = {
|
||||
'bleu': bleu,
|
||||
'xmatch': exact_match,
|
||||
'edit-sim': edit_sim,
|
||||
|
||||
@@ -95,9 +95,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
|
||||
|
||||
if RUN_WITH_BROWSING:
|
||||
instruction += (
|
||||
'<IMPORTANT!>\n'
|
||||
'You SHOULD NEVER attempt to browse the web. '
|
||||
'</IMPORTANT!>\n'
|
||||
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
|
||||
)
|
||||
|
||||
return instruction
|
||||
@@ -243,7 +241,7 @@ def initialize_runtime(
|
||||
|
||||
# Copy the file to the desired location
|
||||
action = CmdRunAction(
|
||||
command=f"cp /tmp/test_suite.py /testbed/{instance['test_file']}"
|
||||
command=f'cp /tmp/test_suite.py /testbed/{instance["test_file"]}'
|
||||
)
|
||||
action.set_hard_timeout(600)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
|
||||
@@ -71,9 +71,10 @@ def process_images(dataset, original_namespace, new_namespace, start_instance_id
|
||||
patch_file_path = 'patch.diff'
|
||||
test_patch_file_path = 'test_patch.diff'
|
||||
|
||||
with open(patch_file_path, 'w') as patch_file, open(
|
||||
test_patch_file_path, 'w'
|
||||
) as test_patch_file:
|
||||
with (
|
||||
open(patch_file_path, 'w') as patch_file,
|
||||
open(test_patch_file_path, 'w') as test_patch_file,
|
||||
):
|
||||
patch_file.write(datum['patch'])
|
||||
test_patch_file.write(datum['test_patch'])
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import ast
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
from evaluation.benchmarks.testgeneval.constants import TestStatus
|
||||
from evaluation.benchmarks.testgeneval.log_parsers import (
|
||||
@@ -37,7 +36,7 @@ def extract_preamble_classes_and_functions(code):
|
||||
|
||||
current_position = 0
|
||||
|
||||
def extract_class_body(code: str, start_index: int) -> Tuple[str, int]:
|
||||
def extract_class_body(code: str, start_index: int) -> tuple[str, int]:
|
||||
"""
|
||||
Extracts the body of a class from the given code starting from the specified index.
|
||||
Returns the class body and the end index of the class body.
|
||||
@@ -168,7 +167,7 @@ def extract_preamble_classes_and_functions(code):
|
||||
|
||||
def filter_passing_tests(
|
||||
test_content: str, test_output: str, repo: str
|
||||
) -> Tuple[str, List[str], List[str]]:
|
||||
) -> tuple[str, list[str], list[str]]:
|
||||
"""
|
||||
Filter tests based on their execution results.
|
||||
Returns:
|
||||
@@ -246,7 +245,7 @@ def filter_passing_tests(
|
||||
|
||||
def filter_tests(
|
||||
test_content: str, test_output: str, repo: str
|
||||
) -> Tuple[str, List[str], List[str]]:
|
||||
) -> tuple[str, list[str], list[str]]:
|
||||
"""
|
||||
Filter tests using AST parsing to remove failing test functions from the test file.
|
||||
Non-test functions (e.g. setup or helper methods) and classes (even if all test methods are failing)
|
||||
|
||||
@@ -24,7 +24,7 @@ def get_test_directives(instance: TestGenEvalInstance) -> list:
|
||||
return ['test.py']
|
||||
|
||||
# Get test directives from test patch and remove non-test files
|
||||
directives = [f"/testbed/{instance['test_file']}"]
|
||||
directives = [f'/testbed/{instance["test_file"]}']
|
||||
|
||||
# For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing)
|
||||
if instance['repo'] == 'django/django':
|
||||
@@ -65,8 +65,8 @@ def load_testgeneval_dataset(
|
||||
if ids - dataset_ids:
|
||||
raise ValueError(
|
||||
(
|
||||
"Some instance IDs not found in dataset!"
|
||||
f"\nMissing IDs:\n{' '.join(ids - dataset_ids)}"
|
||||
'Some instance IDs not found in dataset!'
|
||||
f'\nMissing IDs:\n{" ".join(ids - dataset_ids)}'
|
||||
)
|
||||
)
|
||||
dataset = [instance for instance in dataset if instance['id'] in ids]
|
||||
|
||||
@@ -7,7 +7,7 @@ import os
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from typing import Dict, List, Union
|
||||
from typing import Union
|
||||
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.events.action import BrowseInteractiveAction
|
||||
@@ -100,7 +100,7 @@ class ClickAction(BrowserAction):
|
||||
return f'click("{self.selector}")'
|
||||
|
||||
|
||||
def parse_content_to_elements(content: str) -> Dict[str, str]:
|
||||
def parse_content_to_elements(content: str) -> dict[str, str]:
|
||||
"""Parse the observation content into a dictionary mapping anchors to their descriptions"""
|
||||
elements = {}
|
||||
current_anchor = None
|
||||
@@ -170,7 +170,7 @@ def resolve_action(action: BrowserAction, content: str) -> BrowserAction:
|
||||
|
||||
def pre_login(
|
||||
runtime: Runtime,
|
||||
services: List[str],
|
||||
services: list[str],
|
||||
save_screenshots=True,
|
||||
screenshots_dir='screenshots',
|
||||
):
|
||||
|
||||
@@ -8,7 +8,6 @@ import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from typing import List
|
||||
|
||||
import yaml
|
||||
from browsing import pre_login
|
||||
@@ -68,7 +67,7 @@ def get_config(
|
||||
return config
|
||||
|
||||
|
||||
def load_dependencies(runtime: Runtime) -> List[str]:
|
||||
def load_dependencies(runtime: Runtime) -> list[str]:
|
||||
"""
|
||||
Every task has a dependencies.yml file, which lists all the services that the
|
||||
task depends on. This function loads the file and returns all dependent service names.
|
||||
@@ -128,7 +127,7 @@ def run_solver(
|
||||
runtime: Runtime,
|
||||
task_name: str,
|
||||
config: AppConfig,
|
||||
dependencies: List[str],
|
||||
dependencies: list[str],
|
||||
save_final_state: bool,
|
||||
state_dir: str,
|
||||
save_screenshots: bool,
|
||||
|
||||
@@ -8,7 +8,6 @@ import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from typing import Dict, Tuple
|
||||
|
||||
|
||||
def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
|
||||
@@ -60,7 +59,7 @@ def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> fl
|
||||
raise ValueError(f'Unknown model: {model}')
|
||||
|
||||
|
||||
def analyze_eval_json_file(filepath: str) -> Tuple[int, int]:
|
||||
def analyze_eval_json_file(filepath: str) -> tuple[int, int]:
|
||||
"""
|
||||
Analyze a single eval JSON file and extract the total and result from final_score.
|
||||
|
||||
@@ -84,7 +83,7 @@ def analyze_eval_json_file(filepath: str) -> Tuple[int, int]:
|
||||
return (0, 0)
|
||||
|
||||
|
||||
def analyze_traj_json_file(filepath: str) -> Tuple[int, float]:
|
||||
def analyze_traj_json_file(filepath: str) -> tuple[int, float]:
|
||||
"""
|
||||
Analyze a single trajectory JSON file and extract the steps and tokens
|
||||
for each step. Then estimate the cost based on the tokens and the model type.
|
||||
@@ -115,7 +114,7 @@ def analyze_traj_json_file(filepath: str) -> Tuple[int, float]:
|
||||
|
||||
def analyze_folder(
|
||||
folder_path: str,
|
||||
) -> Tuple[Dict[str, Tuple[int, int]], Dict[str, Tuple[int, float]]]:
|
||||
) -> tuple[dict[str, tuple[int, int]], dict[str, tuple[int, float]]]:
|
||||
"""
|
||||
Analyze all eval_*.json & traj_*.json files in the specified folder.
|
||||
|
||||
@@ -309,7 +308,9 @@ def main():
|
||||
print(
|
||||
f'| Perfect Completions for {task_nature} | {perfect_completions}/{num_of_tasks} ({perfect_completions / num_of_tasks * 100:.2f}%) |'
|
||||
)
|
||||
print(f'| Average Score for {task_nature} | {task_nature_score*100:.2f}% |')
|
||||
print(
|
||||
f'| Average Score for {task_nature} | {task_nature_score * 100:.2f}% |'
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -64,7 +64,7 @@ def initialize_runtime(runtime: Runtime):
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id
|
||||
@@ -80,7 +80,7 @@ def initialize_runtime(runtime: Runtime):
|
||||
|
||||
runtime.add_env_vars({'WOLFRAM_ALPHA_APPID': args.wolfram_alpha_appid})
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
|
||||
|
||||
def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
|
||||
|
||||
@@ -100,7 +100,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id
|
||||
@@ -116,7 +116,7 @@ def initialize_runtime(
|
||||
goal_image_urls = []
|
||||
if hasattr(obs, 'goal_image_urls'):
|
||||
goal_image_urls = obs.goal_image_urls
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
return goal, goal_image_urls
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ def complete_runtime(
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
|
||||
@@ -137,7 +137,7 @@ def complete_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return {
|
||||
'rewards': json.loads(obs.content),
|
||||
}
|
||||
|
||||
@@ -87,7 +87,7 @@ def initialize_runtime(
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
# Set instance id
|
||||
@@ -102,7 +102,7 @@ def initialize_runtime(
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
goal = obs.content
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
|
||||
return goal
|
||||
|
||||
|
||||
@@ -115,7 +115,7 @@ def complete_runtime(
|
||||
If you need to do something in the sandbox to get the correctness metric after
|
||||
the agent has run, modify this function.
|
||||
"""
|
||||
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
|
||||
obs: CmdOutputObservation
|
||||
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
|
||||
@@ -123,7 +123,7 @@ def complete_runtime(
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
|
||||
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
|
||||
return {
|
||||
'rewards': json.loads(obs.content),
|
||||
}
|
||||
|
||||
@@ -93,14 +93,14 @@ def process_instance(
|
||||
spec = importlib.util.spec_from_file_location(instance_id, instance.file_path)
|
||||
test_module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(test_module)
|
||||
assert hasattr(
|
||||
test_module, 'Test'
|
||||
), f'Test module {instance_id} does not have a Test class'
|
||||
assert hasattr(test_module, 'Test'), (
|
||||
f'Test module {instance_id} does not have a Test class'
|
||||
)
|
||||
|
||||
test_class: type[BaseIntegrationTest] = test_module.Test
|
||||
assert issubclass(
|
||||
test_class, BaseIntegrationTest
|
||||
), f'Test class {instance_id} does not inherit from BaseIntegrationTest'
|
||||
assert issubclass(test_class, BaseIntegrationTest), (
|
||||
f'Test class {instance_id} does not inherit from BaseIntegrationTest'
|
||||
)
|
||||
|
||||
instruction = test_class.INSTRUCTION
|
||||
|
||||
|
||||
@@ -132,7 +132,7 @@ def run_test_case(test_cases_dir, workspace_dir, request):
|
||||
'python3',
|
||||
f'{SCRIPT_DIR}/../../openhands/main.py',
|
||||
'-d',
|
||||
f"{os.path.join(agent_dir, 'workspace')}",
|
||||
f'{os.path.join(agent_dir, "workspace")}',
|
||||
'-c',
|
||||
f'{agents_ref[agent]}',
|
||||
'-t',
|
||||
@@ -165,7 +165,7 @@ def pytest_configure(config):
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [%(levelname)s] %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(f"test_results_{now.strftime('%Y%m%d_%H%M%S')}.log"),
|
||||
logging.FileHandler(f'test_results_{now.strftime("%Y%m%d_%H%M%S")}.log'),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
|
||||
@@ -221,9 +221,9 @@ def prepare_dataset(
|
||||
eval_ids: list[str] | None = None,
|
||||
skip_num: int | None = None,
|
||||
):
|
||||
assert (
|
||||
'instance_id' in dataset.columns
|
||||
), "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
|
||||
assert 'instance_id' in dataset.columns, (
|
||||
"Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
|
||||
)
|
||||
id_column = 'instance_id'
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_ids: set[str] = set()
|
||||
|
||||
@@ -39,7 +39,9 @@ def refine_prompt(prompt: str):
|
||||
def create_cmd_run_tool(
|
||||
use_short_description: bool = False,
|
||||
) -> ChatCompletionToolParam:
|
||||
description = _SHORT_BASH_DESCRIPTION if use_short_description else _DETAILED_BASH_DESCRIPTION
|
||||
description = (
|
||||
_SHORT_BASH_DESCRIPTION if use_short_description else _DETAILED_BASH_DESCRIPTION
|
||||
)
|
||||
return ChatCompletionToolParam(
|
||||
type='function',
|
||||
function=ChatCompletionToolParamFunctionChunk(
|
||||
|
||||
@@ -131,12 +131,12 @@ upload_file(bid: str, file: str | list[str])
|
||||
|
||||
|
||||
for _, action in _browser_action_space.action_set.items():
|
||||
assert (
|
||||
action.signature in _BROWSER_TOOL_DESCRIPTION
|
||||
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
|
||||
assert (
|
||||
action.description in _BROWSER_TOOL_DESCRIPTION
|
||||
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
|
||||
assert action.signature in _BROWSER_TOOL_DESCRIPTION, (
|
||||
f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
|
||||
)
|
||||
assert action.description in _BROWSER_TOOL_DESCRIPTION, (
|
||||
f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
|
||||
)
|
||||
|
||||
BrowserTool = ChatCompletionToolParam(
|
||||
type='function',
|
||||
|
||||
@@ -52,7 +52,7 @@ class ReadOnlyAgent(CodeActAgent):
|
||||
super().__init__(llm, config)
|
||||
|
||||
logger.debug(
|
||||
f"TOOLS loaded for ReadOnlyAgent: {', '.join([tool.get('function').get('name') for tool in self.tools])}"
|
||||
f'TOOLS loaded for ReadOnlyAgent: {", ".join([tool.get("function").get("name") for tool in self.tools])}'
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
@@ -288,7 +288,12 @@ async def main(loop: asyncio.AbstractEventLoop):
|
||||
|
||||
# Use settings from settings store if available and override with command line arguments
|
||||
if settings:
|
||||
config.default_agent = args.agent_cls if args.agent_cls else settings.agent
|
||||
if args.agent_cls:
|
||||
config.default_agent = str(args.agent_cls)
|
||||
else:
|
||||
# settings.agent is not None because we check for it in setup_config_from_args
|
||||
assert settings.agent is not None
|
||||
config.default_agent = settings.agent
|
||||
if not args.llm_config and settings.llm_model and settings.llm_api_key:
|
||||
llm_config = config.get_llm_config()
|
||||
llm_config.model = settings.llm_model
|
||||
|
||||
@@ -549,7 +549,7 @@ def cli_confirm(
|
||||
] + [
|
||||
(
|
||||
'class:selected' if i == selected[0] else 'class:unselected',
|
||||
f"{'> ' if i == selected[0] else ' '}{choice}\n",
|
||||
f'{"> " if i == selected[0] else " "}{choice}\n',
|
||||
)
|
||||
for i, choice in enumerate(choices)
|
||||
]
|
||||
|
||||
@@ -167,17 +167,17 @@ class Agent(ABC):
|
||||
- mcp_tools (list[dict]): The list of MCP tools.
|
||||
"""
|
||||
logger.info(
|
||||
f"Setting {len(mcp_tools)} MCP tools for agent {self.name}: {[tool['function']['name'] for tool in mcp_tools]}"
|
||||
f'Setting {len(mcp_tools)} MCP tools for agent {self.name}: {[tool["function"]["name"] for tool in mcp_tools]}'
|
||||
)
|
||||
for tool in mcp_tools:
|
||||
_tool = ChatCompletionToolParam(**tool)
|
||||
if _tool['function']['name'] in self.mcp_tools:
|
||||
logger.warning(
|
||||
f"Tool {_tool['function']['name']} already exists, skipping"
|
||||
f'Tool {_tool["function"]["name"]} already exists, skipping'
|
||||
)
|
||||
continue
|
||||
self.mcp_tools[_tool['function']['name']] = _tool
|
||||
self.tools.append(_tool)
|
||||
logger.info(
|
||||
f"Tools updated for agent {self.name}, total {len(self.tools)}: {[tool['function']['name'] for tool in self.tools]}"
|
||||
f'Tools updated for agent {self.name}, total {len(self.tools)}: {[tool["function"]["name"] for tool in self.tools]}'
|
||||
)
|
||||
|
||||
@@ -220,7 +220,7 @@ class State:
|
||||
'trace_version': openhands.__version__,
|
||||
'tags': [
|
||||
f'agent:{agent_name}',
|
||||
f"web_host:{os.environ.get('WEB_HOST', 'unspecified')}",
|
||||
f'web_host:{os.environ.get("WEB_HOST", "unspecified")}',
|
||||
f'openhands_version:{openhands.__version__}',
|
||||
],
|
||||
}
|
||||
|
||||
@@ -142,9 +142,9 @@ async def run_controller(
|
||||
agent, runtime, config, replay_events=replay_events
|
||||
)
|
||||
|
||||
assert isinstance(
|
||||
initial_user_action, Action
|
||||
), f'initial user actions must be an Action, got {type(initial_user_action)}'
|
||||
assert isinstance(initial_user_action, Action), (
|
||||
f'initial user actions must be an Action, got {type(initial_user_action)}'
|
||||
)
|
||||
logger.debug(
|
||||
f'Agent Controller Initialized: Running agent {agent.name}, model '
|
||||
f'{agent.llm.config.model}, with actions: {initial_user_action}'
|
||||
|
||||
@@ -149,9 +149,9 @@ class Message(BaseModel):
|
||||
|
||||
# an observation message with tool response
|
||||
if self.tool_call_id is not None:
|
||||
assert (
|
||||
self.name is not None
|
||||
), 'name is required when tool_call_id is not None'
|
||||
assert self.name is not None, (
|
||||
'name is required when tool_call_id is not None'
|
||||
)
|
||||
message_dict['tool_call_id'] = self.tool_call_id
|
||||
message_dict['name'] = self.name
|
||||
|
||||
|
||||
@@ -36,9 +36,7 @@ class BrowseInteractiveAction(Action):
|
||||
|
||||
@property
|
||||
def message(self) -> str:
|
||||
return (
|
||||
f'I am interacting with the browser:\n' f'```\n{self.browser_actions}\n```'
|
||||
)
|
||||
return f'I am interacting with the browser:\n```\n{self.browser_actions}\n```'
|
||||
|
||||
def __str__(self) -> str:
|
||||
ret = '**BrowseInteractiveAction**\n'
|
||||
|
||||
@@ -186,9 +186,9 @@ class FileEditObservation(Observation):
|
||||
return self.content
|
||||
|
||||
if not self.prev_exist:
|
||||
assert (
|
||||
self.old_content == ''
|
||||
), 'old_content should be empty if the file is new (prev_exist=False).'
|
||||
assert self.old_content == '', (
|
||||
'old_content should be empty if the file is new (prev_exist=False).'
|
||||
)
|
||||
return f'[New file {self.path} is created with the provided content.]\n'
|
||||
|
||||
# Use cached diff if available, otherwise compute it
|
||||
|
||||
@@ -277,7 +277,7 @@ class GitHubService(BaseGitService, GitService):
|
||||
result = response.json()
|
||||
if 'errors' in result:
|
||||
raise UnknownException(
|
||||
f"GraphQL query error: {json.dumps(result['errors'])}"
|
||||
f'GraphQL query error: {json.dumps(result["errors"])}'
|
||||
)
|
||||
|
||||
return dict(result)
|
||||
|
||||
@@ -253,12 +253,12 @@ def convert_tool_call_to_string(tool_call: dict) -> str:
|
||||
if tool_call['type'] != 'function':
|
||||
raise FunctionCallConversionError("Tool call type must be 'function'.")
|
||||
|
||||
ret = f"<function={tool_call['function']['name']}>\n"
|
||||
ret = f'<function={tool_call["function"]["name"]}>\n'
|
||||
try:
|
||||
args = json.loads(tool_call['function']['arguments'])
|
||||
except json.JSONDecodeError as e:
|
||||
raise FunctionCallConversionError(
|
||||
f"Failed to parse arguments as JSON. Arguments: {tool_call['function']['arguments']}"
|
||||
f'Failed to parse arguments as JSON. Arguments: {tool_call["function"]["arguments"]}'
|
||||
) from e
|
||||
for param_name, param_value in args.items():
|
||||
is_multiline = isinstance(param_value, str) and '\n' in param_value
|
||||
@@ -280,8 +280,8 @@ def convert_tools_to_description(tools: list[dict]) -> str:
|
||||
fn = tool['function']
|
||||
if i > 0:
|
||||
ret += '\n'
|
||||
ret += f"---- BEGIN FUNCTION #{i+1}: {fn['name']} ----\n"
|
||||
ret += f"Description: {fn['description']}\n"
|
||||
ret += f'---- BEGIN FUNCTION #{i + 1}: {fn["name"]} ----\n'
|
||||
ret += f'Description: {fn["description"]}\n'
|
||||
|
||||
if 'parameters' in fn:
|
||||
ret += 'Parameters:\n'
|
||||
@@ -790,14 +790,14 @@ def convert_from_multiple_tool_calls_to_single_tool_call_messages(
|
||||
# add the tool result
|
||||
converted_messages.append(message)
|
||||
else:
|
||||
assert (
|
||||
len(pending_tool_calls) == 0
|
||||
), f'Found pending tool calls but not found in pending list: {pending_tool_calls=}'
|
||||
assert len(pending_tool_calls) == 0, (
|
||||
f'Found pending tool calls but not found in pending list: {pending_tool_calls=}'
|
||||
)
|
||||
converted_messages.append(message)
|
||||
else:
|
||||
assert (
|
||||
len(pending_tool_calls) == 0
|
||||
), f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}'
|
||||
assert len(pending_tool_calls) == 0, (
|
||||
f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}'
|
||||
)
|
||||
converted_messages.append(message)
|
||||
|
||||
if not ignore_final_tool_result and len(pending_tool_calls) > 0:
|
||||
|
||||
@@ -158,12 +158,12 @@ async def add_mcp_tools_to_agent(
|
||||
ActionExecutionClient, # inline import to avoid circular import
|
||||
)
|
||||
|
||||
assert isinstance(
|
||||
runtime, ActionExecutionClient
|
||||
), 'Runtime must be an instance of ActionExecutionClient'
|
||||
assert (
|
||||
runtime.runtime_initialized
|
||||
), 'Runtime must be initialized before adding MCP tools'
|
||||
assert isinstance(runtime, ActionExecutionClient), (
|
||||
'Runtime must be an instance of ActionExecutionClient'
|
||||
)
|
||||
assert runtime.runtime_initialized, (
|
||||
'Runtime must be initialized before adding MCP tools'
|
||||
)
|
||||
|
||||
# Add the runtime as another MCP server
|
||||
updated_mcp_config = runtime.get_updated_mcp_config()
|
||||
@@ -171,7 +171,7 @@ async def add_mcp_tools_to_agent(
|
||||
mcp_tools = await fetch_mcp_tools_from_config(updated_mcp_config)
|
||||
|
||||
logger.info(
|
||||
f"Loaded {len(mcp_tools)} MCP tools: {[tool['function']['name'] for tool in mcp_tools]}"
|
||||
f'Loaded {len(mcp_tools)} MCP tools: {[tool["function"]["name"] for tool in mcp_tools]}'
|
||||
)
|
||||
|
||||
# Set the MCP tools on the agent
|
||||
|
||||
@@ -214,7 +214,7 @@ class GitlabIssueHandler(IssueHandlerInterface):
|
||||
|
||||
def reply_to_comment(self, pr_number: int, comment_id: str, reply: str) -> None:
|
||||
response = httpx.get(
|
||||
f'{self.base_url}/merge_requests/{pr_number}/discussions/{comment_id.split('/')[-1]}',
|
||||
f'{self.base_url}/merge_requests/{pr_number}/discussions/{comment_id.split("/")[-1]}',
|
||||
headers=self.headers,
|
||||
)
|
||||
response.raise_for_status()
|
||||
@@ -225,7 +225,7 @@ class GitlabIssueHandler(IssueHandlerInterface):
|
||||
'note_id': discussions.get('notes', [])[-1]['id'],
|
||||
}
|
||||
response = httpx.post(
|
||||
f'{self.base_url}/merge_requests/{pr_number}/discussions/{comment_id.split('/')[-1]}/notes',
|
||||
f'{self.base_url}/merge_requests/{pr_number}/discussions/{comment_id.split("/")[-1]}/notes',
|
||||
headers=self.headers,
|
||||
json=data,
|
||||
)
|
||||
|
||||
@@ -99,7 +99,7 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
|
||||
logger.info(f'Build status: {status}')
|
||||
|
||||
if status == 'SUCCESS':
|
||||
logger.debug(f"Successfully built {status_data['image']}")
|
||||
logger.debug(f'Successfully built {status_data["image"]}')
|
||||
return str(status_data['image'])
|
||||
elif status in [
|
||||
'FAILURE',
|
||||
@@ -139,9 +139,9 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
|
||||
|
||||
if result['exists']:
|
||||
logger.debug(
|
||||
f"Image {image_name} exists. "
|
||||
f"Uploaded at: {result['image']['upload_time']}, "
|
||||
f"Size: {result['image']['image_size_bytes'] / 1024 / 1024:.2f} MB"
|
||||
f'Image {image_name} exists. '
|
||||
f'Uploaded at: {result["image"]["upload_time"]}, '
|
||||
f'Size: {result["image"]["image_size_bytes"] / 1024 / 1024:.2f} MB'
|
||||
)
|
||||
else:
|
||||
logger.debug(f'Image {image_name} does not exist.')
|
||||
|
||||
@@ -115,12 +115,12 @@ class DaytonaRuntime(ActionExecutionClient):
|
||||
|
||||
def _construct_api_url(self, port: int) -> str:
|
||||
assert self.workspace is not None, 'Workspace is not initialized'
|
||||
assert (
|
||||
self.workspace.instance.info is not None
|
||||
), 'Workspace info is not available'
|
||||
assert (
|
||||
self.workspace.instance.info.provider_metadata is not None
|
||||
), 'Provider metadata is not available'
|
||||
assert self.workspace.instance.info is not None, (
|
||||
'Workspace info is not available'
|
||||
)
|
||||
assert self.workspace.instance.info.provider_metadata is not None, (
|
||||
'Provider metadata is not available'
|
||||
)
|
||||
|
||||
node_domain = json.loads(self.workspace.instance.info.provider_metadata)[
|
||||
'nodeDomain'
|
||||
|
||||
@@ -40,9 +40,9 @@ class E2BBox:
|
||||
|
||||
def _archive(self, host_src: str, recursive: bool = False):
|
||||
if recursive:
|
||||
assert os.path.isdir(
|
||||
host_src
|
||||
), 'Source must be a directory when recursive is True'
|
||||
assert os.path.isdir(host_src), (
|
||||
'Source must be a directory when recursive is True'
|
||||
)
|
||||
files = glob(host_src + '/**/*', recursive=True)
|
||||
srcname = os.path.basename(host_src)
|
||||
tar_filename = os.path.join(os.path.dirname(host_src), srcname + '.tar')
|
||||
@@ -52,9 +52,9 @@ class E2BBox:
|
||||
file, arcname=os.path.relpath(file, os.path.dirname(host_src))
|
||||
)
|
||||
else:
|
||||
assert os.path.isfile(
|
||||
host_src
|
||||
), 'Source must be a file when recursive is False'
|
||||
assert os.path.isfile(host_src), (
|
||||
'Source must be a file when recursive is False'
|
||||
)
|
||||
srcname = os.path.basename(host_src)
|
||||
tar_filename = os.path.join(os.path.dirname(host_src), srcname + '.tar')
|
||||
with tarfile.open(tar_filename, mode='w') as tar:
|
||||
|
||||
@@ -130,12 +130,12 @@ class RemoteRuntime(ActionExecutionClient):
|
||||
)
|
||||
self.container_image = self.config.sandbox.runtime_container_image
|
||||
self._start_runtime()
|
||||
assert (
|
||||
self.runtime_id is not None
|
||||
), 'Runtime ID is not set. This should never happen.'
|
||||
assert (
|
||||
self.runtime_url is not None
|
||||
), 'Runtime URL is not set. This should never happen.'
|
||||
assert self.runtime_id is not None, (
|
||||
'Runtime ID is not set. This should never happen.'
|
||||
)
|
||||
assert self.runtime_url is not None, (
|
||||
'Runtime URL is not set. This should never happen.'
|
||||
)
|
||||
self.send_status_message('STATUS$WAITING_FOR_CLIENT')
|
||||
if not self.attach_to_existing:
|
||||
self.log('info', 'Waiting for runtime to be alive...')
|
||||
|
||||
@@ -189,7 +189,7 @@ class JupyterKernel:
|
||||
|
||||
if os.environ.get('DEBUG'):
|
||||
logging.info(
|
||||
f"MSG TYPE: {msg_type.upper()} DONE:{execution_done}\nCONTENT: {msg_dict['content']}"
|
||||
f'MSG TYPE: {msg_type.upper()} DONE:{execution_done}\nCONTENT: {msg_dict["content"]}'
|
||||
)
|
||||
|
||||
if msg_type == 'error':
|
||||
@@ -203,7 +203,7 @@ class JupyterKernel:
|
||||
if 'image/png' in msg_dict['content']['data']:
|
||||
# use markdone to display image (in case of large image)
|
||||
outputs.append(
|
||||
f"\n\n"
|
||||
f'\n\n'
|
||||
)
|
||||
|
||||
elif msg_type == 'execute_reply':
|
||||
@@ -272,7 +272,7 @@ class ExecuteHandler(tornado.web.RequestHandler):
|
||||
|
||||
def make_app() -> tornado.web.Application:
|
||||
jupyter_kernel = JupyterKernel(
|
||||
f"localhost:{os.environ.get('JUPYTER_GATEWAY_PORT', '8888')}",
|
||||
f'localhost:{os.environ.get("JUPYTER_GATEWAY_PORT", "8888")}',
|
||||
os.environ.get('JUPYTER_GATEWAY_KERNEL_ID', 'default'),
|
||||
)
|
||||
asyncio.get_event_loop().run_until_complete(jupyter_kernel.initialize())
|
||||
|
||||
@@ -501,9 +501,9 @@ class BashSession:
|
||||
if len(splited_commands) > 1:
|
||||
return ErrorObservation(
|
||||
content=(
|
||||
f"ERROR: Cannot execute multiple commands at once.\n"
|
||||
f"Please run each command separately OR chain them into a single command via && or ;\n"
|
||||
f"Provided commands:\n{'\n'.join(f'({i + 1}) {cmd}' for i, cmd in enumerate(splited_commands))}"
|
||||
f'ERROR: Cannot execute multiple commands at once.\n'
|
||||
f'Please run each command separately OR chain them into a single command via && or ;\n'
|
||||
f'Provided commands:\n{"\n".join(f"({i + 1}) {cmd}" for i, cmd in enumerate(splited_commands))}'
|
||||
)
|
||||
)
|
||||
|
||||
@@ -591,8 +591,8 @@ class BashSession:
|
||||
logger.debug(
|
||||
f'PANE CONTENT GOT after {time.time() - _start_time:.2f} seconds'
|
||||
)
|
||||
logger.debug(f"BEGIN OF PANE CONTENT: {cur_pane_output.split('\n')[:10]}")
|
||||
logger.debug(f"END OF PANE CONTENT: {cur_pane_output.split('\n')[-10:]}")
|
||||
logger.debug(f'BEGIN OF PANE CONTENT: {cur_pane_output.split("\n")[:10]}')
|
||||
logger.debug(f'END OF PANE CONTENT: {cur_pane_output.split("\n")[-10:]}')
|
||||
ps1_matches = CmdOutputMetadata.matches_ps1_metadata(cur_pane_output)
|
||||
current_ps1_count = len(ps1_matches)
|
||||
|
||||
|
||||
@@ -35,8 +35,8 @@ def generate_file_viewer_html(file_path: str) -> str:
|
||||
# Check if the file extension is supported
|
||||
if file_extension not in supported_extensions:
|
||||
raise ValueError(
|
||||
f"Unsupported file extension: {file_extension}. "
|
||||
f"Supported extensions are: {', '.join(supported_extensions)}"
|
||||
f'Unsupported file extension: {file_extension}. '
|
||||
f'Supported extensions are: {", ".join(supported_extensions)}'
|
||||
)
|
||||
|
||||
# Check if the file exists
|
||||
|
||||
@@ -385,9 +385,9 @@ if __name__ == '__main__':
|
||||
# and create a Dockerfile dynamically and place it in the build_folder only. This allows the Docker image to
|
||||
# then be created using the Dockerfile (most likely using the containers/build.sh script)
|
||||
build_folder = args.build_folder
|
||||
assert os.path.exists(
|
||||
build_folder
|
||||
), f'Build folder {build_folder} does not exist'
|
||||
assert os.path.exists(build_folder), (
|
||||
f'Build folder {build_folder} does not exist'
|
||||
)
|
||||
logger.debug(
|
||||
f'Copying the source code and generating the Dockerfile in the build folder: {build_folder}'
|
||||
)
|
||||
|
||||
@@ -176,9 +176,9 @@ class InvariantAnalyzer(SecurityAnalyzer):
|
||||
],
|
||||
)
|
||||
)
|
||||
assert (
|
||||
self.guardrail_llm is not None
|
||||
), 'InvariantAnalyzer.guardrail_llm should be initialized before calling check_usertask'
|
||||
assert self.guardrail_llm is not None, (
|
||||
'InvariantAnalyzer.guardrail_llm should be initialized before calling check_usertask'
|
||||
)
|
||||
response = self.guardrail_llm.completion(
|
||||
messages=self.guardrail_llm.format_messages_for_llm(messages),
|
||||
stop=['.'],
|
||||
@@ -261,9 +261,9 @@ class InvariantAnalyzer(SecurityAnalyzer):
|
||||
],
|
||||
)
|
||||
)
|
||||
assert (
|
||||
self.guardrail_llm is not None
|
||||
), 'InvariantAnalyzer.guardrail_llm should be initialized before calling check_fillaction'
|
||||
assert self.guardrail_llm is not None, (
|
||||
'InvariantAnalyzer.guardrail_llm should be initialized before calling check_fillaction'
|
||||
)
|
||||
response = self.guardrail_llm.completion(
|
||||
messages=self.guardrail_llm.format_messages_for_llm(messages),
|
||||
stop=['.'],
|
||||
|
||||
@@ -20,7 +20,7 @@ TraceElement = Message | ToolCall | ToolOutput | Function
|
||||
|
||||
|
||||
def get_next_id(trace: list[TraceElement]) -> str:
|
||||
used_ids = [el.id for el in trace if type(el) == ToolCall]
|
||||
used_ids = [el.id for el in trace if isinstance(el, ToolCall)]
|
||||
for i in range(1, len(used_ids) + 2):
|
||||
if str(i) not in used_ids:
|
||||
return str(i)
|
||||
@@ -31,7 +31,7 @@ def get_last_id(
|
||||
trace: list[TraceElement],
|
||||
) -> str | None:
|
||||
for el in reversed(trace):
|
||||
if type(el) == ToolCall:
|
||||
if isinstance(el, ToolCall):
|
||||
return el.id
|
||||
return None
|
||||
|
||||
@@ -39,12 +39,12 @@ def get_last_id(
|
||||
def parse_action(trace: list[TraceElement], action: Action) -> list[TraceElement]:
|
||||
next_id = get_next_id(trace)
|
||||
inv_trace: list[TraceElement] = []
|
||||
if type(action) == MessageAction:
|
||||
if isinstance(action, MessageAction):
|
||||
if action.source == EventSource.USER:
|
||||
inv_trace.append(Message(role='user', content=action.content))
|
||||
else:
|
||||
inv_trace.append(Message(role='assistant', content=action.content))
|
||||
elif type(action) in [NullAction, ChangeAgentStateAction]:
|
||||
elif isinstance(action, (NullAction, ChangeAgentStateAction)):
|
||||
pass
|
||||
elif hasattr(action, 'action') and action.action is not None:
|
||||
event_dict = event_to_dict(action)
|
||||
@@ -63,7 +63,7 @@ def parse_observation(
|
||||
trace: list[TraceElement], obs: Observation
|
||||
) -> list[TraceElement]:
|
||||
last_id = get_last_id(trace)
|
||||
if type(obs) in [NullObservation, AgentStateChangedObservation]:
|
||||
if isinstance(obs, (NullObservation, AgentStateChangedObservation)):
|
||||
return []
|
||||
elif hasattr(obs, 'content') and obs.content is not None:
|
||||
return [ToolOutput(role='tool', content=obs.content, tool_call_id=last_id)]
|
||||
|
||||
@@ -7,14 +7,12 @@ from typing import Callable, Iterable
|
||||
import socketio
|
||||
|
||||
from openhands.core.config.app_config import AppConfig
|
||||
from openhands.core.config.llm_config import LLMConfig
|
||||
from openhands.core.exceptions import AgentRuntimeUnavailableError
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.schema.agent import AgentState
|
||||
from openhands.events.action import MessageAction
|
||||
from openhands.events.event import EventSource
|
||||
from openhands.events.event_store import EventStore
|
||||
from openhands.events.stream import EventStream, EventStreamSubscriber, session_exists
|
||||
from openhands.events.stream import EventStreamSubscriber, session_exists
|
||||
from openhands.server.config.server_config import ServerConfig
|
||||
from openhands.server.monitoring import MonitoringListener
|
||||
from openhands.server.session.agent_session import WAIT_TIME_BEFORE_CLOSE
|
||||
@@ -25,7 +23,10 @@ from openhands.storage.data_models.conversation_metadata import ConversationMeta
|
||||
from openhands.storage.data_models.settings import Settings
|
||||
from openhands.storage.files import FileStore
|
||||
from openhands.utils.async_utils import GENERAL_TIMEOUT, call_async_from_sync, wait_all
|
||||
from openhands.utils.conversation_summary import get_default_conversation_title, auto_generate_title
|
||||
from openhands.utils.conversation_summary import (
|
||||
auto_generate_title,
|
||||
get_default_conversation_title,
|
||||
)
|
||||
from openhands.utils.import_utils import get_impl
|
||||
from openhands.utils.shutdown_listener import should_continue
|
||||
|
||||
@@ -208,7 +209,6 @@ class StandaloneConversationManager(ConversationManager):
|
||||
store = await conversation_store_class.get_instance(self.config, user_id)
|
||||
return store
|
||||
|
||||
|
||||
async def get_running_agent_loops(
|
||||
self, user_id: str | None = None, filter_to_sids: set[str] | None = None
|
||||
) -> set[str]:
|
||||
@@ -287,7 +287,7 @@ class StandaloneConversationManager(ConversationManager):
|
||||
response_ids = await self.get_running_agent_loops(user_id)
|
||||
if len(response_ids) >= self.config.max_concurrent_conversations:
|
||||
logger.info(
|
||||
f'too_many_sessions_for:{user_id or ''}',
|
||||
f'too_many_sessions_for:{user_id or ""}',
|
||||
extra={'session_id': sid, 'user_id': user_id},
|
||||
)
|
||||
# Get the conversations sorted (oldest first)
|
||||
@@ -300,7 +300,7 @@ class StandaloneConversationManager(ConversationManager):
|
||||
while len(conversations) >= self.config.max_concurrent_conversations:
|
||||
oldest_conversation_id = conversations.pop().conversation_id
|
||||
logger.debug(
|
||||
f'closing_from_too_many_sessions:{user_id or ''}:{oldest_conversation_id}',
|
||||
f'closing_from_too_many_sessions:{user_id or ""}:{oldest_conversation_id}',
|
||||
extra={'session_id': oldest_conversation_id, 'user_id': user_id},
|
||||
)
|
||||
# Send status message to client and close session.
|
||||
@@ -332,7 +332,9 @@ class StandaloneConversationManager(ConversationManager):
|
||||
try:
|
||||
session.agent_session.event_stream.subscribe(
|
||||
EventStreamSubscriber.SERVER,
|
||||
self._create_conversation_update_callback(user_id, github_user_id, sid, settings),
|
||||
self._create_conversation_update_callback(
|
||||
user_id, github_user_id, sid, settings
|
||||
),
|
||||
UPDATED_AT_CALLBACK_ID,
|
||||
)
|
||||
except ValueError:
|
||||
@@ -429,7 +431,11 @@ class StandaloneConversationManager(ConversationManager):
|
||||
)
|
||||
|
||||
def _create_conversation_update_callback(
|
||||
self, user_id: str | None, github_user_id: str | None, conversation_id: str, settings: Settings
|
||||
self,
|
||||
user_id: str | None,
|
||||
github_user_id: str | None,
|
||||
conversation_id: str,
|
||||
settings: Settings,
|
||||
) -> Callable:
|
||||
def callback(event, *args, **kwargs):
|
||||
call_async_from_sync(
|
||||
@@ -444,9 +450,13 @@ class StandaloneConversationManager(ConversationManager):
|
||||
|
||||
return callback
|
||||
|
||||
|
||||
async def _update_conversation_for_event(
|
||||
self, user_id: str, github_user_id: str, conversation_id: str, settings: Settings, event=None
|
||||
self,
|
||||
user_id: str,
|
||||
github_user_id: str,
|
||||
conversation_id: str,
|
||||
settings: Settings,
|
||||
event=None,
|
||||
):
|
||||
conversation_store = await self._get_conversation_store(user_id, github_user_id)
|
||||
conversation = await conversation_store.get_metadata(conversation_id)
|
||||
@@ -469,8 +479,12 @@ class StandaloneConversationManager(ConversationManager):
|
||||
token_usage.prompt_tokens + token_usage.completion_tokens
|
||||
)
|
||||
default_title = get_default_conversation_title(conversation_id)
|
||||
if conversation.title == default_title: # attempt to autogenerate if default title is in use
|
||||
title = await auto_generate_title(conversation_id, user_id, self.file_store, settings)
|
||||
if (
|
||||
conversation.title == default_title
|
||||
): # attempt to autogenerate if default title is in use
|
||||
title = await auto_generate_title(
|
||||
conversation_id, user_id, self.file_store, settings
|
||||
)
|
||||
if title and not title.isspace():
|
||||
conversation.title = title
|
||||
try:
|
||||
|
||||
@@ -27,7 +27,7 @@ def store_feedback(feedback: FeedbackDataModel) -> dict[str, str]:
|
||||
display_feedback = feedback.model_dump()
|
||||
if 'trajectory' in display_feedback:
|
||||
display_feedback['trajectory'] = (
|
||||
f"elided [length: {len(display_feedback['trajectory'])}"
|
||||
f'elided [length: {len(display_feedback["trajectory"])}'
|
||||
)
|
||||
if 'token' in display_feedback:
|
||||
display_feedback['token'] = 'elided'
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import asyncio
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
[build-system]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
requires = [
|
||||
"poetry-core",
|
||||
]
|
||||
|
||||
[tool.poetry]
|
||||
name = "openhands-ai"
|
||||
version = "0.37.0"
|
||||
@@ -9,7 +15,7 @@ repository = "https://github.com/All-Hands-AI/OpenHands"
|
||||
packages = [
|
||||
{ include = "openhands/**/*" },
|
||||
{ include = "pyproject.toml", to = "openhands" },
|
||||
{ include = "poetry.lock", to = "openhands" }
|
||||
{ include = "poetry.lock", to = "openhands" },
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
@@ -97,39 +103,12 @@ pandas = "*"
|
||||
reportlab = "*"
|
||||
gevent = ">=24.2.1,<26.0.0"
|
||||
|
||||
[tool.coverage.run]
|
||||
concurrency = ["gevent"]
|
||||
|
||||
|
||||
[tool.poetry.group.runtime.dependencies]
|
||||
jupyterlab = "*"
|
||||
notebook = "*"
|
||||
jupyter_kernel_gateway = "*"
|
||||
flake8 = "*"
|
||||
|
||||
[build-system]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
requires = [
|
||||
"poetry-core",
|
||||
]
|
||||
|
||||
[tool.autopep8]
|
||||
# autopep8 fights with mypy on line length issue
|
||||
ignore = [ "E501" ]
|
||||
|
||||
[tool.black]
|
||||
# prevent black (if installed) from changing single quotes to double quotes
|
||||
skip-string-normalization = true
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = ["D"]
|
||||
# ignore warnings for missing docstrings
|
||||
ignore = ["D1"]
|
||||
|
||||
[tool.ruff.lint.pydocstyle]
|
||||
convention = "google"
|
||||
|
||||
|
||||
[tool.poetry.group.evaluation.dependencies]
|
||||
streamlit = "*"
|
||||
whatthepatch = "*"
|
||||
@@ -152,10 +131,6 @@ boto3-stubs = {extras = ["s3"], version = "^1.37.19"}
|
||||
pyarrow = "20.0.0" # transitive dependency, pinned here to avoid conflicts
|
||||
datasets = "*"
|
||||
|
||||
[tool.poetry-dynamic-versioning]
|
||||
enable = true
|
||||
style = "semver"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
openhands = "openhands.core.cli:main"
|
||||
|
||||
@@ -164,3 +139,24 @@ fuzzywuzzy = "^0.18.0"
|
||||
rouge = "^1.0.1"
|
||||
python-levenshtein = ">=0.26.1,<0.28.0"
|
||||
tree-sitter-python = "^0.23.6"
|
||||
|
||||
[tool.poetry-dynamic-versioning]
|
||||
enable = true
|
||||
style = "semver"
|
||||
|
||||
[tool.autopep8]
|
||||
# autopep8 fights with mypy on line length issue
|
||||
ignore = [ "E501" ]
|
||||
|
||||
[tool.black]
|
||||
# prevent black (if installed) from changing single quotes to double quotes
|
||||
skip-string-normalization = true
|
||||
|
||||
[tool.ruff]
|
||||
lint.select = [ "D" ]
|
||||
# ignore warnings for missing docstrings
|
||||
lint.ignore = [ "D1" ]
|
||||
lint.pydocstyle.convention = "google"
|
||||
|
||||
[tool.coverage.run]
|
||||
concurrency = [ "gevent" ]
|
||||
|
||||
@@ -760,9 +760,9 @@ def test_python_version(temp_dir, runtime_cls, run_as_openhands):
|
||||
try:
|
||||
obs = runtime.run_action(CmdRunAction(command='python --version'))
|
||||
|
||||
assert isinstance(
|
||||
obs, CmdOutputObservation
|
||||
), 'The observation should be a CmdOutputObservation.'
|
||||
assert isinstance(obs, CmdOutputObservation), (
|
||||
'The observation should be a CmdOutputObservation.'
|
||||
)
|
||||
assert obs.exit_code == 0, 'The exit code should be 0.'
|
||||
assert 'Python 3' in obs.content, 'The output should contain "Python 3".'
|
||||
finally:
|
||||
|
||||
@@ -25,9 +25,9 @@ def test_env_vars_os_environ(temp_dir, runtime_cls, run_as_openhands):
|
||||
)
|
||||
print(obs)
|
||||
assert obs.exit_code == 0, 'The exit code should be 0.'
|
||||
assert (
|
||||
obs.content.strip().split('\n\r')[0].strip() == 'BAZ'
|
||||
), f'Output: [{obs.content}] for {runtime_cls}'
|
||||
assert obs.content.strip().split('\n\r')[0].strip() == 'BAZ', (
|
||||
f'Output: [{obs.content}] for {runtime_cls}'
|
||||
)
|
||||
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
@@ -168,9 +168,9 @@ def test_grep_to_cmdrun_paths_with_spaces(runtime_cls, run_as_openhands, temp_di
|
||||
|
||||
obs = _run_cmd_action(runtime, cmd)
|
||||
assert obs.exit_code == 0, f'Grep command failed for path: {path}'
|
||||
assert (
|
||||
'function' in obs.content
|
||||
), f'Expected pattern not found in output for path: {path}'
|
||||
assert 'function' in obs.content, (
|
||||
f'Expected pattern not found in output for path: {path}'
|
||||
)
|
||||
|
||||
# Verify the actual file was found
|
||||
if path == 'src/my project':
|
||||
|
||||
@@ -77,9 +77,9 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
|
||||
action_read = FileReadAction(path='hello.sh')
|
||||
logger.info(action_read, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action_read)
|
||||
assert isinstance(
|
||||
obs, FileReadObservation
|
||||
), 'The observation should be a FileReadObservation.'
|
||||
assert isinstance(obs, FileReadObservation), (
|
||||
'The observation should be a FileReadObservation.'
|
||||
)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
assert obs.content == 'echo "Hello, World!"\n'
|
||||
|
||||
@@ -39,9 +39,9 @@ def test_edit_from_scratch(temp_dir, runtime_cls, run_as_openhands):
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
assert isinstance(
|
||||
obs, FileEditObservation
|
||||
), 'The observation should be a FileEditObservation.'
|
||||
assert isinstance(obs, FileEditObservation), (
|
||||
'The observation should be a FileEditObservation.'
|
||||
)
|
||||
|
||||
action = FileReadAction(
|
||||
path=os.path.join('/workspace', 'app.py'),
|
||||
@@ -78,9 +78,9 @@ def test_edit(temp_dir, runtime_cls, run_as_openhands):
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
assert isinstance(
|
||||
obs, FileEditObservation
|
||||
), 'The observation should be a FileEditObservation.'
|
||||
assert isinstance(obs, FileEditObservation), (
|
||||
'The observation should be a FileEditObservation.'
|
||||
)
|
||||
|
||||
action = FileReadAction(
|
||||
path=os.path.join('/workspace', 'app.py'),
|
||||
@@ -138,9 +138,9 @@ def test_edit_long_file(temp_dir, runtime_cls, run_as_openhands):
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
assert isinstance(
|
||||
obs, FileEditObservation
|
||||
), 'The observation should be a FileEditObservation.'
|
||||
assert isinstance(obs, FileEditObservation), (
|
||||
'The observation should be a FileEditObservation.'
|
||||
)
|
||||
|
||||
action = FileReadAction(
|
||||
path=os.path.join('/workspace', 'app.py'),
|
||||
|
||||
@@ -23,9 +23,9 @@ from openhands.events.observation import CmdOutputObservation, MCPObservation
|
||||
def test_default_activated_tools():
|
||||
project_root = os.path.dirname(openhands.__file__)
|
||||
mcp_config_path = os.path.join(project_root, 'runtime', 'mcp', 'config.json')
|
||||
assert os.path.exists(
|
||||
mcp_config_path
|
||||
), f'MCP config file not found at {mcp_config_path}'
|
||||
assert os.path.exists(mcp_config_path), (
|
||||
f'MCP config file not found at {mcp_config_path}'
|
||||
)
|
||||
with open(mcp_config_path, 'r') as f:
|
||||
mcp_config = json.load(f)
|
||||
assert 'default' in mcp_config
|
||||
@@ -63,9 +63,9 @@ async def test_fetch_mcp_via_stdio(temp_dir, runtime_cls, run_as_openhands):
|
||||
mcp_action = MCPAction(name='fetch', arguments={'url': 'http://localhost:8000'})
|
||||
obs = await runtime.call_tool_mcp(mcp_action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(
|
||||
obs, MCPObservation
|
||||
), 'The observation should be a MCPObservation.'
|
||||
assert isinstance(obs, MCPObservation), (
|
||||
'The observation should be a MCPObservation.'
|
||||
)
|
||||
|
||||
result_json = json.loads(obs.content)
|
||||
assert not result_json['isError']
|
||||
|
||||
@@ -468,9 +468,9 @@ def test_stress_runtime_memory_limits_with_repeated_file_edit():
|
||||
new_str=f'-content_{i:03d}',
|
||||
)
|
||||
obs = runtime.run_action(edit_action)
|
||||
assert (
|
||||
f'The file {test_file} has been edited' in obs.content
|
||||
), f'Edit failed at iteration {i}'
|
||||
assert f'The file {test_file} has been edited' in obs.content, (
|
||||
f'Edit failed at iteration {i}'
|
||||
)
|
||||
logger.info(f'finished iteration {i}')
|
||||
|
||||
# Verify final file state using FileEditAction view command
|
||||
|
||||
@@ -240,7 +240,9 @@ def test_guess_success_rate_limit_wait_time(mock_litellm_completion, default_con
|
||||
wait_time = mock_sleep.call_args[0][0]
|
||||
assert (
|
||||
default_config.retry_min_wait <= wait_time <= default_config.retry_max_wait
|
||||
), f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
|
||||
), (
|
||||
f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
|
||||
)
|
||||
|
||||
|
||||
@patch('openhands.llm.llm.litellm_completion')
|
||||
|
||||
@@ -71,9 +71,9 @@ def test_pr_title_with_quotes(monkeypatch):
|
||||
data = kwargs.get('json', {})
|
||||
title = data.get('title', '')
|
||||
expected = "Fix issue #123: Issue with 'quotes' and \"double quotes\" and <class 'ValueError'>"
|
||||
assert (
|
||||
title == expected
|
||||
), f'PR title was incorrectly escaped.\nExpected: {expected}\nGot: {title}'
|
||||
assert title == expected, (
|
||||
f'PR title was incorrectly escaped.\nExpected: {expected}\nGot: {title}'
|
||||
)
|
||||
return MockResponse()
|
||||
|
||||
class MockGetResponse:
|
||||
@@ -98,7 +98,7 @@ def test_pr_title_with_quotes(monkeypatch):
|
||||
original_run = subprocess.run
|
||||
|
||||
def mock_run(*args, **kwargs):
|
||||
print(f"Running command: {args[0] if args else kwargs.get('args', [])}")
|
||||
print(f'Running command: {args[0] if args else kwargs.get("args", [])}')
|
||||
if isinstance(args[0], list) and args[0][0] == 'git':
|
||||
if 'push' in args[0]:
|
||||
return subprocess.CompletedProcess(
|
||||
|
||||
@@ -478,13 +478,14 @@ async def test_process_issue(
|
||||
mock_run_controller.return_value = test_case['run_controller_return']
|
||||
|
||||
# Patch the necessary functions and methods
|
||||
with patch(
|
||||
'openhands.resolver.resolve_issue.create_runtime', mock_create_runtime
|
||||
), patch(
|
||||
'openhands.resolver.resolve_issue.run_controller', mock_run_controller
|
||||
), patch.object(
|
||||
with (
|
||||
patch('openhands.resolver.resolve_issue.create_runtime', mock_create_runtime),
|
||||
patch('openhands.resolver.resolve_issue.run_controller', mock_run_controller),
|
||||
patch.object(
|
||||
resolver, 'complete_runtime', return_value={'git_patch': 'test patch'}
|
||||
), patch.object(resolver, 'initialize_runtime') as mock_initialize_runtime:
|
||||
),
|
||||
patch.object(resolver, 'initialize_runtime') as mock_initialize_runtime,
|
||||
):
|
||||
# Call the process_issue method
|
||||
result = await resolver.process_issue(issue, base_commit, handler_instance)
|
||||
|
||||
|
||||
@@ -142,9 +142,9 @@ index 9daeafb..b02def2 100644
|
||||
with open(dos_file, 'rb') as f:
|
||||
dos_content = f.read()
|
||||
|
||||
assert (
|
||||
b'\r\n' not in unix_content
|
||||
), 'Unix-style line endings were changed to DOS-style'
|
||||
assert b'\r\n' not in unix_content, (
|
||||
'Unix-style line endings were changed to DOS-style'
|
||||
)
|
||||
assert b'\r\n' in dos_content, 'DOS-style line endings were changed to Unix-style'
|
||||
|
||||
# Check if content was updated correctly
|
||||
|
||||
@@ -242,7 +242,9 @@ def test_guess_success_rate_limit_wait_time(mock_litellm_completion, default_con
|
||||
wait_time = mock_sleep.call_args[0][0]
|
||||
assert (
|
||||
default_config.retry_min_wait <= wait_time <= default_config.retry_max_wait
|
||||
), f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
|
||||
), (
|
||||
f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
|
||||
)
|
||||
|
||||
|
||||
@patch('openhands.llm.llm.litellm_completion')
|
||||
|
||||
@@ -72,9 +72,9 @@ def test_pr_title_with_quotes(monkeypatch):
|
||||
data = kwargs.get('json', {})
|
||||
title = data.get('title', '')
|
||||
expected = "Fix issue #123: Issue with 'quotes' and \"double quotes\" and <class 'ValueError'>"
|
||||
assert (
|
||||
title == expected
|
||||
), f'PR title was incorrectly escaped.\nExpected: {expected}\nGot: {title}'
|
||||
assert title == expected, (
|
||||
f'PR title was incorrectly escaped.\nExpected: {expected}\nGot: {title}'
|
||||
)
|
||||
return MockResponse()
|
||||
|
||||
class MockGetResponse:
|
||||
@@ -99,7 +99,7 @@ def test_pr_title_with_quotes(monkeypatch):
|
||||
original_run = subprocess.run
|
||||
|
||||
def mock_run(*args, **kwargs):
|
||||
logger.info(f"Running command: {args[0] if args else kwargs.get('args', [])}")
|
||||
logger.info(f'Running command: {args[0] if args else kwargs.get("args", [])}')
|
||||
if isinstance(args[0], list) and args[0][0] == 'git':
|
||||
if 'push' in args[0]:
|
||||
return subprocess.CompletedProcess(
|
||||
|
||||
@@ -506,15 +506,18 @@ async def test_process_issue(
|
||||
mock_run_controller.return_value = test_case['run_controller_return']
|
||||
|
||||
# Patch the necessary functions and methods
|
||||
with patch(
|
||||
'openhands.resolver.resolve_issue.create_runtime', mock_create_runtime
|
||||
), patch(
|
||||
'openhands.resolver.resolve_issue.run_controller', mock_run_controller
|
||||
), patch.object(
|
||||
with (
|
||||
patch('openhands.resolver.resolve_issue.create_runtime', mock_create_runtime),
|
||||
patch('openhands.resolver.resolve_issue.run_controller', mock_run_controller),
|
||||
patch.object(
|
||||
resolver, 'complete_runtime', return_value={'git_patch': 'test patch'}
|
||||
), patch.object(resolver, 'initialize_runtime') as mock_initialize_runtime, patch(
|
||||
),
|
||||
patch.object(resolver, 'initialize_runtime') as mock_initialize_runtime,
|
||||
patch(
|
||||
'openhands.resolver.resolve_issue.SandboxConfig', return_value=MagicMock()
|
||||
), patch('openhands.resolver.resolve_issue.AppConfig', return_value=MagicMock()):
|
||||
),
|
||||
patch('openhands.resolver.resolve_issue.AppConfig', return_value=MagicMock()),
|
||||
):
|
||||
# Call the process_issue method
|
||||
result = await resolver.process_issue(issue, base_commit, handler_instance)
|
||||
|
||||
|
||||
@@ -143,9 +143,9 @@ index 9daeafb..b02def2 100644
with open(dos_file, 'rb') as f:
dos_content = f.read()

assert (
b'\r\n' not in unix_content
), 'Unix-style line endings were changed to DOS-style'
assert b'\r\n' not in unix_content, (
'Unix-style line endings were changed to DOS-style'
)
assert b'\r\n' in dos_content, 'DOS-style line endings were changed to Unix-style'

# Check if content was updated correctly
@@ -308,7 +308,7 @@ def test_update_existing_pull_request(
)

# Assert: Check if the auto-generated comment was posted to the PR
comment_url = f'https://gitlab.com/api/v4/projects/{quote(f'{issue.owner}/{issue.repo}', safe="")}/issues/{issue.number}/notes'
comment_url = f'https://gitlab.com/api/v4/projects/{quote(f"{issue.owner}/{issue.repo}", safe="")}/issues/{issue.number}/notes'
expected_comment = 'This is an issue resolution.'
mock_requests_post.assert_called_once_with(
comment_url,
@@ -697,7 +697,7 @@ def test_reply_to_comment(mock_get, mock_post, mock_issue):

# Check that the correct request was made to the API
mock_post.assert_called_once_with(
f'https://gitlab.com/api/v4/projects/{quote(f'{mock_issue.owner}/{mock_issue.repo}', safe="")}/merge_requests/{mock_issue.number}/discussions/{comment_id.split('/')[-1]}/notes',
f'https://gitlab.com/api/v4/projects/{quote(f"{mock_issue.owner}/{mock_issue.repo}", safe="")}/merge_requests/{mock_issue.number}/discussions/{comment_id.split("/")[-1]}/notes',
headers={
'Authorization': f'Bearer {token}',
'Accept': 'application/json',

@@ -1,6 +1,5 @@
import asyncio
from contextlib import contextmanager
from typing import Type
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
@@ -19,7 +18,7 @@ def test_llm():
return _get_llm(LLM)


def _get_llm(type_: Type[LLM]):
def _get_llm(type_: type[LLM]):
with _patch_http():
return type_(config=config.get_llm_config())

@@ -82,7 +81,7 @@ async def test_acompletion_streaming(mock_response):
async for chunk in test_llm.async_streaming_completion(
messages=[{'role': 'user', 'content': 'Hello!'}], stream=True
):
print(f"Chunk: {chunk['choices'][0]['delta']['content']}")
print(f'Chunk: {chunk["choices"][0]["delta"]["content"]}')
# Assertions for streaming completion
assert chunk['choices'][0]['delta']['content'] in [
r['choices'][0]['delta']['content'] for r in mock_response
@@ -187,7 +186,7 @@ async def test_async_streaming_completion_with_user_cancellation(cancel_after_ch
messages=[{'role': 'user', 'content': 'Hello!'}], stream=True
):
received_chunks.append(chunk['choices'][0]['delta']['content'])
print(f"Chunk: {chunk['choices'][0]['delta']['content']}")
print(f'Chunk: {chunk["choices"][0]["delta"]["content"]}')

# Assert that we received the expected number of chunks before cancellation
assert len(received_chunks) == cancel_after_chunks

@@ -23,21 +23,21 @@ def serialization_deserialization(
original_action_dict, cls, max_message_chars: int = 10000
):
action_instance = event_from_dict(original_action_dict)
assert isinstance(
action_instance, Action
), 'The action instance should be an instance of Action.'
assert isinstance(
action_instance, cls
), f'The action instance should be an instance of {cls.__name__}.'
assert isinstance(action_instance, Action), (
'The action instance should be an instance of Action.'
)
assert isinstance(action_instance, cls), (
f'The action instance should be an instance of {cls.__name__}.'
)

# event_to_dict is the regular serialization of an event
serialized_action_dict = event_to_dict(action_instance)

# it has an extra message property, for the UI
serialized_action_dict.pop('message')
assert (
serialized_action_dict == original_action_dict
), 'The serialized action should match the original action dict.'
assert serialized_action_dict == original_action_dict, (
'The serialized action should match the original action dict.'
)


def test_event_props_serialization_deserialization():

@@ -717,9 +717,9 @@ async def test_run_controller_max_iterations_has_metrics(
== 'RuntimeError: Agent reached maximum iteration in headless mode. Current iteration: 3, max iteration: 3'
)

assert (
state.metrics.accumulated_cost == 10.0 * 3
), f'Expected accumulated cost to be 30.0, but got {state.metrics.accumulated_cost}'
assert state.metrics.accumulated_cost == 10.0 * 3, (
f'Expected accumulated cost to be 30.0, but got {state.metrics.accumulated_cost}'
)


@pytest.mark.asyncio
@@ -1434,14 +1434,14 @@ async def test_agent_controller_processes_null_observation_with_cause():

# Verify the NullObservation has a cause that points to the RecallAction
assert null_observation.cause is not None, 'NullObservation cause is None'
assert (
null_observation.cause == recall_action.id
), f'Expected cause={recall_action.id}, got cause={null_observation.cause}'
assert null_observation.cause == recall_action.id, (
f'Expected cause={recall_action.id}, got cause={null_observation.cause}'
)

# Verify the controller's should_step method returns True for this observation
assert controller.should_step(
null_observation
), 'should_step should return True for this NullObservation'
assert controller.should_step(null_observation), (
'should_step should return True for this NullObservation'
)

# Verify the controller's step method was called
# This means the controller processed the NullObservation
@@ -1453,9 +1453,9 @@ async def test_agent_controller_processes_null_observation_with_cause():
null_observation_zero._cause = 0  # type: ignore[attr-defined]

# Verify the controller's should_step method would return False for this observation
assert not controller.should_step(
null_observation_zero
), 'should_step should return False for NullObservation with cause=0'
assert not controller.should_step(null_observation_zero), (
'should_step should return False for NullObservation with cause=0'
)


def test_agent_controller_should_step_with_null_observation_cause_zero(mock_agent):
@@ -1481,9 +1481,9 @@ def test_agent_controller_should_step_with_null_observation_cause_zero(mock_agen
result = controller.should_step(null_observation)

# It should return False since we only want to step on NullObservation with cause > 0
assert (
result is False
), 'should_step should return False for NullObservation with cause = 0'
assert result is False, (
'should_step should return False for NullObservation with cause = 0'
)


def test_system_message_in_event_stream(mock_agent, test_event_stream):
@@ -1563,8 +1563,8 @@ async def test_openrouter_context_window_exceeded_error(
condensation_actions = [e for e in events if isinstance(e, CondensationAction)]

# There should be at least one CondensationAction if the error was handled correctly
assert (
len(condensation_actions) > 0
), 'OpenRouter context window exceeded error was not handled correctly'
assert len(condensation_actions) > 0, (
'OpenRouter context window exceeded error was not handled correctly'
)

await controller.close()

@@ -140,14 +140,14 @@ async def test_delegation_flow(mock_parent_agent, mock_child_agent, mock_event_s
assert any(isinstance(event, AgentDelegateAction) for event in events)

# Verify that a delegate agent controller is created
assert (
parent_controller.delegate is not None
), "Parent's delegate controller was not set."
assert parent_controller.delegate is not None, (
"Parent's delegate controller was not set."
)

# The parent's iteration should have incremented
assert (
parent_controller.state.iteration == 1
), 'Parent iteration should be incremented after step.'
assert parent_controller.state.iteration == 1, (
'Parent iteration should be incremented after step.'
)

# Now simulate that the child increments local iteration and finishes its subtask
delegate_controller = parent_controller.delegate
@@ -160,14 +160,14 @@ async def test_delegation_flow(mock_parent_agent, mock_child_agent, mock_event_s
await asyncio.sleep(0.5)

# Now the parent's delegate is None
assert (
parent_controller.delegate is None
), 'Parent delegate should be None after child finishes.'
assert parent_controller.delegate is None, (
'Parent delegate should be None after child finishes.'
)

# Parent's global iteration is updated from the child
assert (
parent_controller.state.iteration == 6
), "Parent iteration should be the child's iteration + 1 after child is done."
assert parent_controller.state.iteration == 6, (
"Parent iteration should be the child's iteration + 1 after child is done."
)

# Cleanup
await parent_controller.close()

@@ -93,15 +93,20 @@ async def test_agent_session_start_with_no_state(mock_agent):
memory.microagents_dir = 'test-dir'

# Patch AgentController and State.restore_from_session to fail; patch Memory in AgentSession
with patch(
with (
patch(
'openhands.server.session.agent_session.AgentController', SpyAgentController
), patch(
),
patch(
'openhands.server.session.agent_session.EventStream',
return_value=mock_event_stream,
), patch(
),
patch(
'openhands.controller.state.state.State.restore_from_session',
side_effect=Exception('No state found'),
), patch('openhands.server.session.agent_session.Memory', return_value=memory):
),
patch('openhands.server.session.agent_session.Memory', return_value=memory),
):
await session.start(
runtime_name='test-runtime',
config=AppConfig(),
@@ -181,15 +186,20 @@ async def test_agent_session_start_with_restored_state(mock_agent):
mock_memory = MagicMock(spec=Memory)

# Patch AgentController and State.restore_from_session to succeed, patch Memory in AgentSession
with patch(
with (
patch(
'openhands.server.session.agent_session.AgentController', SpyAgentController
), patch(
),
patch(
'openhands.server.session.agent_session.EventStream',
return_value=mock_event_stream,
), patch(
),
patch(
'openhands.controller.state.state.State.restore_from_session',
return_value=mock_restored_state,
), patch('openhands.server.session.agent_session.Memory', mock_memory):
),
patch('openhands.server.session.agent_session.Memory', mock_memory),
):
await session.start(
runtime_name='test-runtime',
config=AppConfig(),

Some files were not shown because too many files have changed in this diff.