Update pre-commit hooks to the most recent versions (#8343)

Co-authored-by: openhands <openhands@all-hands.dev>
Graham Neubig
2025-05-07 23:59:13 -04:00
committed by GitHub
parent d5a8d4251c
commit 689d3c9046
296 changed files with 882 additions and 847 deletions

View File

@@ -2,10 +2,9 @@
import os
import re
import sys
from typing import Set, Tuple
def find_version_references(directory: str) -> Tuple[Set[str], Set[str]]:
def find_version_references(directory: str) -> tuple[set[str], set[str]]:
openhands_versions = set()
runtime_versions = set()
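The signature rewrite above is the PEP 585 style the updated tooling enforces: builtin tuple/set generics replace typing.Tuple/typing.Set, so the typing import can be dropped. A minimal sketch of the same pattern (the helper below is illustrative, not taken from the repository):

def split_versions(tags: list[str]) -> tuple[set[str], set[str]]:
    # Builtin generics need no `from typing import Set, Tuple` on Python 3.9+.
    openhands_versions = {t for t in tags if t.startswith('openhands-')}
    runtime_versions = {t for t in tags if t.startswith('runtime-')}
    return openhands_versions, runtime_versions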

.openhands/pre-commit.sh Normal file → Executable file
View File

View File

@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
exclude: docs/modules/python
@@ -10,17 +10,17 @@ repos:
- id: debug-statements
- repo: https://github.com/tox-dev/pyproject-fmt
rev: 1.7.0
rev: v2.5.1
hooks:
- id: pyproject-fmt
- repo: https://github.com/abravalheri/validate-pyproject
rev: v0.16
rev: v0.24.1
hooks:
- id: validate-pyproject
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.4.1
rev: v0.11.8
hooks:
# Run the linter.
- id: ruff
@@ -33,7 +33,7 @@ repos:
types_or: [python, pyi, jupyter]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.9.0
rev: v1.15.0
hooks:
- id: mypy
additional_dependencies:
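The rev bumps above are the kind of refresh that `pre-commit autoupdate` produces for each pinned repo. A hedged sketch for listing the resulting pins, assuming PyYAML is installed and the config lives at the repository root:

import yaml  # PyYAML

with open('.pre-commit-config.yaml') as f:
    config = yaml.safe_load(f)

for repo in config['repos']:
    # e.g. https://github.com/astral-sh/ruff-pre-commit  v0.11.8
    print(repo['repo'], repo.get('rev', '<unpinned>'))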

View File

@@ -20,6 +20,12 @@ ignore = [
"B010",
"B904",
"B018",
# Temporarily ignore ASYNC rules until they can be properly fixed in a separate PR
"ASYNC110",
"ASYNC220",
"ASYNC221",
"ASYNC230",
"ASYNC251",
]
[lint.flake8-quotes]
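The newly ignored ASYNC rules come from ruff's flake8-async checks, which the bumped ruff release now reports in this codebase. A hedged illustration of the pattern such rules target (the exact rule-to-code mapping is assumed here, e.g. a blocking sleep inside a coroutine for ASYNC251):

import asyncio
import time

async def poll_status() -> None:
    # Blocking call inside an async function: it stalls the whole event loop
    # (the kind of pattern flake8-async flags).
    time.sleep(1)
    # Non-blocking alternative that yields control back to the loop.
    await asyncio.sleep(1)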

View File

@@ -73,7 +73,7 @@ class Q20Game:
usr_msg = self.answerer(guesser_question)
self.guesser_messages.append(
{'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
{'role': 'user', 'content': f'{usr_msg["content"].strip()}'}
)
if 'bingo' in usr_msg['content'].lower():
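Most of the remaining hunks in this commit are the same mechanical change: the newer ruff formatter normalizes f-strings to the project's single-quote preference (quoting nested subscripts with double quotes) and spaces out expressions inside the braces, which earlier releases left untouched. A small self-contained illustration with placeholder data:

usr_msg = {'content': ' bingo '}

# Previously the outer double quotes were kept so the inner single quotes survived:
old_style = f"{usr_msg['content'].strip()}"

# The updated formatter flips the outer quotes and double-quotes the subscript:
new_style = f'{usr_msg["content"].strip()}'

assert old_style == new_style == 'bingo'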

View File

@@ -67,7 +67,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set instance id
@@ -100,7 +100,7 @@ def initialize_runtime(
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def complete_runtime(
@@ -113,7 +113,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
agent_answer = None
@@ -165,7 +165,7 @@ def complete_runtime(
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
final_ans = obs.content
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return {
'final_ans': final_ans,
'agent_answer': agent_answer,

View File

@@ -80,7 +80,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n")
logger.info(f'\n{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}\n')
obs: CmdOutputObservation
# Set instance id
@@ -110,7 +110,7 @@ def initialize_runtime(
file_path,
'/workspace',
)
logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n")
logger.info(f'\n{"-" * 50} END Runtime Initialization Fn {"-" * 50}\n')
def complete_runtime(
@@ -123,7 +123,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n")
logger.info(f'\n{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}\n')
obs: CmdOutputObservation
# Rewriting the test file to ignore any changes Agent may have made.
@@ -147,7 +147,7 @@ def complete_runtime(
if isinstance(obs, CmdOutputObservation):
exit_code = obs.exit_code
logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n")
logger.info(f'\n{"-" * 50} END Runtime Completion Fn {"-" * 50}\n')
runtime.close()

View File

@@ -84,7 +84,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
file_ext = FILE_EXT_MAP[instance.language.lower()]
@@ -128,7 +128,7 @@ def initialize_runtime(
assert obs.exit_code == 0
# download repository archive
repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
repository_url = f'https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split("/")[1]}.zip'
action = CmdRunAction(command='wget -O repo.zip ' + repository_url)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
@@ -160,7 +160,7 @@ def initialize_runtime(
obs = runtime.run_action(action)
assert obs.exit_code == 0, f'Failed to remove the code: {obs.content}'
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def complete_runtime(
@@ -173,7 +173,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
test_result = {'result': {}, 'metadata': {}}
@@ -233,7 +233,7 @@ def complete_runtime(
test_result['metadata']['2_run_test_success'] = False
test_result['metadata']['2_run_test_result'] = str(obs.content)
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return test_result
@@ -258,7 +258,7 @@ def process_instance(
instruction = (
f'Please complete the function "{instance.signature}" in the file /workspace/{instance.repository.split("/")[1]}/{instance.filePath}.\n'
f'The environment has been set up for you to start working. You may assume all necessary tools are installed.\n'
f'To complete the task, you must directly modify the file and fill in the function, keeping in mind that the function signature is on line {instance.lineStart-1}\n\n'
f'To complete the task, you must directly modify the file and fill in the function, keeping in mind that the function signature is on line {instance.lineStart - 1}\n\n'
f'The function should do the following:\n'
f'{instance.promptSummaryOnly}\n\n'
)

View File

@@ -44,7 +44,7 @@ def remove_code(target_filepath: str, line_start: int, line_end: int, language:
lines = (
lines[:line_start]
+ [
f"{' '*comment_indent_size+comment_prefix[language.lower()]}TODO: replace with your code here"
f'{" " * comment_indent_size + comment_prefix[language.lower()]}TODO: replace with your code here'
]
+ ([''] * 2)
+ lines[line_end:]

View File

@@ -184,7 +184,7 @@ def load_bird():
.fetchall()
)
prompt += (
f"/*\n3 example rows:\n{top_k_row_query}\n{' '.join(headers)}\n"
f'/*\n3 example rows:\n{top_k_row_query}\n{" ".join(headers)}\n'
)
for row in top_k_rows:
row = [str(x) for x in row]
@@ -201,10 +201,10 @@ def load_bird():
# Extract the CREATE TABLE statements and sample data from the database
prompt = _extract_create_table_prompt(db_path)
prompt += f"-- External Knowledge: {e['evidence']}\n\n"
prompt += f'-- External Knowledge: {e["evidence"]}\n\n'
prompt += '-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n'
prompt += '-- Using valid SQLite, answer the following questions for the tables provided above.\n'
prompt += f"Question: {e['question']}\n"
prompt += f'Question: {e["question"]}\n'
return prompt
@@ -224,7 +224,7 @@ def load_bird():
item = {
'instance_id': f'{len(processed_data)}',
'db_path': os.path.join(
database_path, e['db_id'], f"{e['db_id']}.sqlite"
database_path, e['db_id'], f'{e["db_id"]}.sqlite'
),
'db_id': e['db_id'],
'instruction': _create_prompt(e, database_path),
@@ -253,7 +253,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Copy the database to the workspace
@@ -273,7 +273,7 @@ def initialize_runtime(
assert obs.exit_code == 0
assert f'{instance.db_id}.sqlite' in obs.content
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def complete_runtime(
@@ -286,7 +286,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
timeout = 30
@@ -343,7 +343,7 @@ def complete_runtime(
'gen_sql': gen_sql,
'gold_sql': gold_sql,
}
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return test_result

View File

@@ -34,9 +34,9 @@ SUPPORTED_AGENT_CLS = {'CodeActAgent'}
def get_config(
metadata: EvalMetadata,
) -> AppConfig:
assert (
metadata.max_iterations == 1
), 'max_iterations must be 1 for browsing delegation evaluation.'
assert metadata.max_iterations == 1, (
'max_iterations must be 1 for browsing delegation evaluation.'
)
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
config = AppConfig(
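The assert rewrite in this hunk is another effect of the formatter bump: when an assert with a message no longer fits on one line, the newer style keeps the condition on the assert line and parenthesizes only the message instead of wrapping the condition. A hedged, standalone illustration:

max_iterations = 1

# Older wrapping:
# assert (
#     max_iterations == 1
# ), 'max_iterations must be 1 for browsing delegation evaluation.'

# Newer wrapping produced by the updated formatter:
assert max_iterations == 1, (
    'max_iterations must be 1 for browsing delegation evaluation.'
)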

View File

@@ -82,9 +82,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
if RUN_WITH_BROWSING:
instruction += (
'<IMPORTANT!>\n'
'You SHOULD NEVER attempt to browse the web. '
'</IMPORTANT!>\n'
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
)
return instruction
@@ -265,7 +263,7 @@ def complete_runtime(
test_dir = instance['test']['test_dir']
action = CmdRunAction(
command=f"{instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"
command=f'{instance["test"]["test_cmd"]} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1'
)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})

View File

@@ -489,7 +489,7 @@ def run_eval_gold_vs_gen_NL_hypo_workflow(
gen_subh_to_gold_subh[p_id] = g_id
gold_subh_covered.append(g_id)
gen_gold_subh_to_context[f'P{p_id}||G{g_id}'] = {
'question': f"""Comapring: GoldH: {gold_subh["text"]}, GoldC: {gold_subh['context']}\nGenH: {gen_subh['text']}, GenC: {gen_subh['context']}""",
'question': f"""Comapring: GoldH: {gold_subh['text']}, GoldC: {gold_subh['context']}\nGenH: {gen_subh['text']}, GenC: {gen_subh['context']}""",
'answer': context_bool,
'score': context_score,
}

View File

@@ -145,7 +145,7 @@ def initialize_runtime(runtime: Runtime, data_files: list[str]):
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
action = CmdRunAction(command='mkdir -p /workspace')
@@ -170,7 +170,7 @@ def initialize_runtime(runtime: Runtime, data_files: list[str]):
obs = runtime.run_action(action)
assert obs.exit_code == 0
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def get_last_agent_finish_action(state: State) -> AgentFinishAction:

View File

@@ -21,7 +21,7 @@ def main():
total += 1
if out['test_result']['score']:
success += 1
print(f'Success rate: {success}/{total} = {success/total}')
print(f'Success rate: {success}/{total} = {success / total}')
if __name__ == '__main__':

View File

@@ -78,7 +78,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
action = CmdRunAction(command='mkdir -p /workspace')
@@ -110,7 +110,7 @@ def initialize_runtime(
obs = runtime.run_action(action)
assert obs.exit_code == 0
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def process_instance(
@@ -134,10 +134,10 @@ def process_instance(
dest_file = None
# Prepare instruction
instruction = f"{instance['Question']}\n"
instruction = f'{instance["Question"]}\n'
logger.info(f'Instruction: {instruction}')
if dest_file:
instruction += f"\n\nThe mentioned file is provided in the workspace at: {dest_file.split('/')[-1]}"
instruction += f'\n\nThe mentioned file is provided in the workspace at: {dest_file.split("/")[-1]}'
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
instruction += 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'

View File

@@ -21,7 +21,7 @@ def split_string(
) -> list[str]:
if char_list is None:
char_list = [',', ';']
pattern = f"[{''.join(char_list)}]"
pattern = f'[{"".join(char_list)}]'
return re.split(pattern, s)

View File

@@ -112,7 +112,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
action = CmdRunAction(command='mkdir -p /workspace')
@@ -143,7 +143,7 @@ def initialize_runtime(
obs = runtime.run_action(action)
assert obs.exit_code == 0
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def complete_runtime(
@@ -156,7 +156,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
# default value
@@ -190,7 +190,7 @@ def complete_runtime(
'timeout': timeout,
'num_workers': num_workers,
}
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return test_result

View File

@@ -73,7 +73,7 @@ def run_eval(
runtime: Runtime,
):
"""Run the evaluation and create report"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
lca_path = bench_config['LCA_PATH']
@@ -146,7 +146,7 @@ def run_eval(
obs = runtime.run_action(action)
report_str = obs.content
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
return report_str

View File

@@ -95,7 +95,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
lca_path = bench_config['LCA_PATH']
@@ -177,7 +177,7 @@ def initialize_runtime(
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def complete_runtime(
@@ -190,7 +190,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
model_name = bench_config['model_name']
@@ -227,7 +227,7 @@ def complete_runtime(
obs = runtime.run_action(action)
result = json.loads(obs.content)
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return result
@@ -313,7 +313,7 @@ Phase 7. VERIFICATION: Test your implementation thoroughly.
7.2.3 The functions you changed
7.4 If any tests fail, revise your implementation until all tests pass
Phase 8. REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance["sha_fail"]}.
Phase 8. REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance['sha_fail']}.
8.1 Ensure you've fully addressed all requirements.
Once all phases are done, announce: 'Agent Task Complete'.

View File

@@ -141,7 +141,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set instance id
@@ -174,7 +174,7 @@ def initialize_runtime(
ipynb_obs = runtime.run_action(action)
logger.info(ipynb_obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
# Prepare instruction

View File

@@ -82,7 +82,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set instance id
@@ -103,7 +103,7 @@ def initialize_runtime(
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
return goal, obs
@@ -116,7 +116,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
@@ -124,7 +124,7 @@ def complete_runtime(
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return {
'rewards': json.loads(obs.content),
}

View File

@@ -130,7 +130,7 @@ def initialize_runtime(runtime: Runtime):
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set instance id
@@ -144,7 +144,7 @@ def initialize_runtime(runtime: Runtime):
obs = runtime.run_action(action)
assert obs.exit_code == 0
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def process_instance(

View File

@@ -93,7 +93,7 @@ def classify_error(llm: LLM, failed_case: dict) -> str:
error_category = response.choices[0].message['content']
except Exception as e:
logger.error(
f"Failed to classify the error for the failed case: {failed_case['instance_id']}"
f'Failed to classify the error for the failed case: {failed_case["instance_id"]}'
)
logger.error(e)
error_category = input(

View File

@@ -103,7 +103,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set instance id
@@ -137,7 +137,7 @@ def initialize_runtime(
obs = runtime.run_action(action)
assert obs.exit_code == 0
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def complete_runtime(
@@ -150,7 +150,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
repo_url = instance['github']
@@ -199,7 +199,7 @@ def complete_runtime(
outputs['success'] = 1
outputs['eval_exit_code'] = obs.exit_code
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return outputs

View File

@@ -120,9 +120,9 @@ def process_instance(
"""
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
assert (
log_dir is not None
), "Can't reset logger without a provided log directory."
assert log_dir is not None, (
"Can't reset logger without a provided log directory."
)
os.makedirs(log_dir, exist_ok=True)
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
@@ -289,7 +289,7 @@ def process_instance(
)
report = _report[instance_id]
logger.info(
f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
f'[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report["resolved"]}'
)
instance['test_result']['report']['resolved'] = report[
'resolved'
@@ -365,9 +365,9 @@ if __name__ == '__main__':
for line in tqdm(f, desc='Loading predictions')
]
)
assert (
'instance_id' in predictions.columns
), 'Input file must contain instance_id column.'
assert 'instance_id' in predictions.columns, (
'Input file must contain instance_id column.'
)
if 'model_patch' not in predictions.columns and (
'test_result' in predictions.columns
@@ -376,17 +376,17 @@ if __name__ == '__main__':
raise ValueError(
'Input file must contain model_patch column OR test_result column with model_patch field.'
)
assert len(predictions['instance_id'].unique()) == len(
predictions
), 'instance_id column must be unique.'
assert len(predictions['instance_id'].unique()) == len(predictions), (
'instance_id column must be unique.'
)
if 'model_patch' not in predictions.columns:
predictions['model_patch'] = predictions['test_result'].apply(
lambda x: x.get('git_patch', '')
)
assert {'instance_id', 'model_patch'}.issubset(
set(predictions.columns)
), 'Input file must contain instance_id and model_patch columns.'
assert {'instance_id', 'model_patch'}.issubset(set(predictions.columns)), (
'Input file must contain instance_id and model_patch columns.'
)
# Process model_patch
predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)

View File

@@ -103,21 +103,21 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
f'<issue_description>\n'
f'{instance.problem_statement}\n'
'</issue_description>\n\n'
"Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n"
'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
"I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
"Also the development Java environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
"Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n"
"Follow these steps to resolve the issue:\n"
"1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n"
'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
'Follow these steps to resolve the issue:\n'
'1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
'2. Create a Java class to reproduce the error and execute it by first compiling with `javac <classname>.java` and then running with `java <classname>` using the BashTool, to confirm the error\n'
"3. Edit the sourcecode of the repo to resolve the issue.\n"
"4. Rerun your reproduce script or class and confirm that the error is fixed!\n"
"5. Think about edgecases, add comprehensive tests for them in your reproduce class or script, and run them to make sure your fix handles these cases as well.\n"
f"6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance['base_commit']}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n"
" - The issue you are fixing\n"
" - The files you modified\n"
" - The functions or classes you changed\n"
" Make sure all these tests pass with your changes.\n"
'3. Edit the sourcecode of the repo to resolve the issue.\n'
'4. Rerun your reproduce script or class and confirm that the error is fixed!\n'
'5. Think about edgecases, add comprehensive tests for them in your reproduce class or script, and run them to make sure your fix handles these cases as well.\n'
f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
' - The issue you are fixing\n'
' - The files you modified\n'
' - The functions or classes you changed\n'
' Make sure all these tests pass with your changes.\n'
"Your thinking should be thorough and so it's fine if it's very long.\n"
),
'go': (
@@ -275,9 +275,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
if instruction and RUN_WITH_BROWSING:
instruction += (
'<IMPORTANT!>\n'
'You SHOULD NEVER attempt to browse the web. '
'</IMPORTANT!>\n'
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
)
return instruction

View File

@@ -3,9 +3,10 @@ import json
input_file = 'XXX.jsonl'
output_file = 'YYY.jsonl'
with open(input_file, 'r', encoding='utf-8') as fin, open(
output_file, 'w', encoding='utf-8'
) as fout:
with (
open(input_file, 'r', encoding='utf-8') as fin,
open(output_file, 'w', encoding='utf-8') as fout,
):
for line in fin:
line = line.strip()
if not line:
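The `with` rewrite above moves to the parenthesized multi-context form that Python 3.10+ accepts, which the updated formatter prefers over splitting a single `open(...) as ...` call across lines. A hedged sketch with throwaway temporary files:

import os
import tempfile

tmp_dir = tempfile.mkdtemp()
src = os.path.join(tmp_dir, 'in.jsonl')
dst = os.path.join(tmp_dir, 'out.jsonl')
open(src, 'w', encoding='utf-8').close()

# Parenthesized context managers keep each `open(...)` on its own line.
with (
    open(src, 'r', encoding='utf-8') as fin,
    open(dst, 'w', encoding='utf-8') as fout,
):
    for line in fin:
        fout.write(line)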

View File

@@ -92,7 +92,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set up workspace directories
@@ -123,7 +123,7 @@ def initialize_runtime(
assert obs.exit_code == 0
assert dataset_name in obs.content
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def complete_runtime(
@@ -136,7 +136,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
test_result = {}
@@ -156,7 +156,7 @@ def complete_runtime(
else:
test_result = {'program': 'ERROR'}
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return test_result

View File

@@ -129,15 +129,15 @@ def process_instance(
AssertionError: if `conditional_imports` is not provided.
"""
assert (
conditional_imports is not None
), 'conditional_imports must be provided to run process_instance using multiprocessing'
assert conditional_imports is not None, (
'conditional_imports must be provided to run process_instance using multiprocessing'
)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
assert (
log_dir is not None
), "Can't reset logger without a provided log directory."
assert log_dir is not None, (
"Can't reset logger without a provided log directory."
)
os.makedirs(log_dir, exist_ok=True)
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
@@ -319,7 +319,7 @@ def process_instance(
)
report = _report[instance_id]
logger.info(
f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
f'[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report["resolved"]}'
)
instance['test_result']['report']['resolved'] = report[
'resolved'
@@ -418,9 +418,9 @@ if __name__ == '__main__':
for line in tqdm(f, desc='Loading predictions')
]
)
assert (
'instance_id' in predictions.columns
), 'Input file must contain instance_id column.'
assert 'instance_id' in predictions.columns, (
'Input file must contain instance_id column.'
)
if 'model_patch' not in predictions.columns and (
'test_result' in predictions.columns
@@ -429,17 +429,17 @@ if __name__ == '__main__':
raise ValueError(
'Input file must contain model_patch column OR test_result column with model_patch field.'
)
assert len(predictions['instance_id'].unique()) == len(
predictions
), 'instance_id column must be unique.'
assert len(predictions['instance_id'].unique()) == len(predictions), (
'instance_id column must be unique.'
)
if 'model_patch' not in predictions.columns:
predictions['model_patch'] = predictions['test_result'].apply(
lambda x: x.get('git_patch', '')
)
assert {'instance_id', 'model_patch'}.issubset(
set(predictions.columns)
), 'Input file must contain instance_id and model_patch columns.'
assert {'instance_id', 'model_patch'}.issubset(set(predictions.columns)), (
'Input file must contain instance_id and model_patch columns.'
)
# Process model_patch
predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)

View File

@@ -160,7 +160,7 @@ Phase 7. VERIFICATION: Test your implementation thoroughly.
7.2 Add edge cases to your test script to ensure comprehensive coverage.
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.
8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance["base_commit"]}.
8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance['base_commit']}.
8.1 Ensure you've fully addressed all requirements.
8.2 Run any tests in the repository related to:
8.2.1 The issue you are fixing
@@ -173,16 +173,14 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
if RUN_WITH_BROWSING:
instruction += (
'<IMPORTANT!>\n'
'You SHOULD NEVER attempt to browse the web. '
'</IMPORTANT!>\n'
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
)
if 'image_assets' in instance:
assets = json.loads(instance['image_assets'])
assert (
'problem_statement' in assets
), 'problem_statement is required in image_assets'
assert 'problem_statement' in assets, (
'problem_statement is required in image_assets'
)
image_urls = assets['problem_statement']
return MessageAction(content=instruction, image_urls=image_urls)
return MessageAction(content=instruction)

View File

@@ -137,7 +137,7 @@ for repo, diff in repo_diffs:
is_significant = diff >= threshold
repo_color = 'red' if is_significant else 'yellow'
print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
print(f'\n{colored(repo, repo_color, attrs=["bold"])}:')
print(
colored(
f'Difference: {diff} instances! (Larger diff = Y better)',

View File

@@ -44,17 +44,17 @@ def convert_history_to_str(history):
if isinstance(event, list):
# "event" is a legacy pair of (action, observation)
event_obj = event_from_dict(event[0])
ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
ret += f'## {i + 1}| {event_obj.__class__.__name__}\n\n'
ret += str(event_obj)
ret += separator
event_obj = event_from_dict(event[1])
ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
ret += f'## {i + 1}| {event_obj.__class__.__name__}\n\n'
ret += str(event_obj)
else:
# "event" is a single event
event_obj = event_from_dict(event)
ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
ret += f'## {i + 1}| {event_obj.__class__.__name__}\n\n'
ret += str(event_obj)
return ret
@@ -105,12 +105,12 @@ def convert_tool_call_to_string(tool_call: dict) -> str:
if tool_call['type'] != 'function':
raise ValueError("Tool call type must be 'function'.")
ret = f"<function={tool_call['function']['name']}>\n"
ret = f'<function={tool_call["function"]["name"]}>\n'
try:
args = json.loads(tool_call['function']['arguments'])
except json.JSONDecodeError as e:
raise ValueError(
f"Failed to parse arguments as JSON. Arguments: {tool_call['function']['arguments']}"
f'Failed to parse arguments as JSON. Arguments: {tool_call["function"]["arguments"]}'
) from e
for param_name, param_value in args.items():
is_multiline = isinstance(param_value, str) and '\n' in param_value

View File

@@ -263,38 +263,38 @@ if __name__ == '__main__':
# Print detailed results for single file
print(f'\nResults for {args.input_path}:')
print(
f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
f'Number of resolved: {result["resolved"]["count"]} / {result["total_instances"]} ({result["resolved"]["percentage"]:.2f}% [{result["resolved"]["ci"][0]:.2f}%, {result["resolved"]["ci"][1]:.2f}%])'
)
print(
f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
f'Number of empty patch: {result["empty_patches"]["count"]} / {result["total_instances"]} ({result["empty_patches"]["percentage"]:.2f}%)'
)
print(
f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
f'Number of error lines: {result["errors"]["total"]} / {result["total_instances"]} ({result["errors"]["percentage"]:.2f}%)'
)
print(
f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
f'Number of agent stuck in loop: {result["errors"]["stuck_in_loop"]["count"]} / {result["total_instances"]} ({result["errors"]["stuck_in_loop"]["percentage"]:.2f}%)'
)
print(
f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
f'Number of unfinished runs: {result["unfinished_runs"]["count"]} / {result["total_instances"]} ({result["unfinished_runs"]["percentage"]:.2f}%)'
)
print(f"Total cost: {result['costs']['total']:.2f} USD")
print(f'Total cost: {result["costs"]["total"]:.2f} USD')
print('## Statistics')
print(
f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
f'Avg. num of turns per instance: {result["statistics"]["avg_turns"]:.2f}'
)
print(
f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
f'Avg. agent cost per instance: {result["statistics"]["costs"]["main_agent"]:.2f} USD'
)
print(
f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
f'Avg. editor cost per instance: {result["statistics"]["costs"]["editor"]:.2f} USD'
)
print(
f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
f'Avg. total cost per instance: {result["statistics"]["costs"]["total"]:.2f} USD'
)
print('## Detailed error breakdown:')
for error, data in result['errors']['breakdown'].items():
print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
print(f'{error}: {data["count"]} ({data["percentage"]:.2f}%)')
except Exception as e:
print(f'Error processing {args.input_path}: {str(e)}')

View File

@@ -34,16 +34,16 @@ if os.path.exists(swebench_official_report_json):
report = json.load(f)
output_md = (
"# SWE-bench Report\n"
"This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n"
"## Summary\n"
f"- total instances: {report['total_instances']}\n"
f"- submitted instances: {report['submitted_instances']}\n"
f"- completed instances: {report['completed_instances']}\n"
f"- empty patch instances: {report['empty_patch_instances']}\n"
f"- resolved instances: {report['resolved_instances']}\n"
f"- unresolved instances: {report['unresolved_instances']}\n"
f"- error instances: {report['error_instances']}\n"
'# SWE-bench Report\n'
'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
'## Summary\n'
f'- total instances: {report["total_instances"]}\n'
f'- submitted instances: {report["submitted_instances"]}\n'
f'- completed instances: {report["completed_instances"]}\n'
f'- empty patch instances: {report["empty_patch_instances"]}\n'
f'- resolved instances: {report["resolved_instances"]}\n'
f'- unresolved instances: {report["unresolved_instances"]}\n'
f'- error instances: {report["error_instances"]}\n'
)
output_md += '\n## Resolved Instances\n'
@@ -111,12 +111,12 @@ elif os.path.exists(openhands_remote_report_jsonl):
print(f'Total instances in eval report: {n_eval_instances}')
# Verify no duplicates
assert (
len(instance_ids) == n_instances
), 'Duplicate instance ids found in original output'
assert (
len(eval_instance_ids) == n_eval_instances
), 'Duplicate instance ids found in eval report'
assert len(instance_ids) == n_instances, (
'Duplicate instance ids found in original output'
)
assert len(eval_instance_ids) == n_eval_instances, (
'Duplicate instance ids found in eval report'
)
# Initialize counters
stats = {'total': len(instance_ids), 'resolved': 0, 'empty_patch': 0, 'error': 0}
@@ -152,7 +152,7 @@ elif os.path.exists(openhands_remote_report_jsonl):
# Generate markdown report
def _instance_id_to_log_path(instance_id):
path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
path = f'{args.input_file.replace(".jsonl", ".swebench_eval.logs")}/instance_{instance_id}.log'
return os.path.relpath(path, start=dirname)
# ... rest of markdown generation code remains the same ...
@@ -228,9 +228,10 @@ if os.path.exists(args.input_file + '.bak'):
os.rename(args.input_file, args.input_file + '.bak')
# Process and write file row by row
with open(args.input_file + '.bak', 'r') as infile, open(
args.input_file, 'w'
) as outfile:
with (
open(args.input_file + '.bak', 'r') as infile,
open(args.input_file, 'w') as outfile,
):
for line in tqdm(infile, desc='Updating output file'):
data = json.loads(line)
instance_id = data['instance_id']

View File

@@ -20,7 +20,7 @@ def verify_instance_costs(row: pd.Series) -> float:
try:
metrics = row.get('metrics')
if not metrics:
logger.warning(f"Instance {row['instance_id']}: No metrics found")
logger.warning(f'Instance {row["instance_id"]}: No metrics found')
return 0.0
accumulated = metrics.get('accumulated_cost')
@@ -28,7 +28,7 @@ def verify_instance_costs(row: pd.Series) -> float:
if accumulated is None:
logger.warning(
f"Instance {row['instance_id']}: No accumulated_cost in metrics"
f'Instance {row["instance_id"]}: No accumulated_cost in metrics'
)
return 0.0
@@ -41,8 +41,8 @@ def verify_instance_costs(row: pd.Series) -> float:
if abs(costs[i]['cost'] - costs[i + 1]['cost']) < 1e-6:
has_duplicate = True
logger.debug(
f"Instance {row['instance_id']}: Possible buggy double-counting detected! "
f"Steps {i} and {i+1} have identical costs: {costs[i]['cost']:.2f}"
f'Instance {row["instance_id"]}: Possible buggy double-counting detected! '
f'Steps {i} and {i + 1} have identical costs: {costs[i]["cost"]:.2f}'
)
else:
all_pairs_match = False
@@ -64,15 +64,15 @@ def verify_instance_costs(row: pd.Series) -> float:
if not abs(total_cost - accumulated) < 1e-6:
logger.warning(
f"Instance {row['instance_id']}: Cost mismatch: "
f"accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, "
f'Instance {row["instance_id"]}: Cost mismatch: '
f'accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, '
)
return total_cost
except Exception as e:
logger.error(
f"Error verifying costs for instance {row.get('instance_id', 'UNKNOWN')}: {e}"
f'Error verifying costs for instance {row.get("instance_id", "UNKNOWN")}: {e}'
)
return 0.0

View File

@@ -46,7 +46,7 @@
"for FILE_PATH in FILE_PATHS:\n",
" with gzip.open(FILE_PATH, 'rb') as f: # Use 'rb' for gzipped files\n",
" for i, line in tqdm(\n",
" enumerate(f), desc=f\"Processing {FILE_PATH.split('/')[-1]}\"\n",
" enumerate(f), desc=f'Processing {FILE_PATH.split(\"/\")[-1]}'\n",
" ):\n",
" # Parse only the fields we need\n",
" raw_data = json.loads(line)\n",

View File

@@ -54,9 +54,9 @@ logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
def get_config(instance: pd.Series) -> AppConfig:
base_container_image = get_instance_docker_image(instance['instance_id_swebench'])
assert (
base_container_image
), f"Invalid container image for instance {instance['instance_id_swebench']}."
assert base_container_image, (
f'Invalid container image for instance {instance["instance_id_swebench"]}.'
)
logger.info(f'Using instance container image: {base_container_image}.')
return AppConfig(
run_as_openhands=False,
@@ -183,9 +183,9 @@ def run_mutation_testing(
mutation_action = CmdRunAction(command=f'cat {log_file}')
mutation_action.set_hard_timeout(300)
mutation_obs = runtime.run_action(mutation_action)
assert isinstance(
mutation_obs, CmdOutputObservation
), 'Failed to retrieve mutation output.'
assert isinstance(mutation_obs, CmdOutputObservation), (
'Failed to retrieve mutation output.'
)
return mutation_obs.exit_code, mutation_obs.content
@@ -294,9 +294,9 @@ def process_instance(
AssertionError: if the `reset_logger` flag is set without a provided log directory.
"""
if reset_logger:
assert (
log_dir is not None
), "Can't reset logger without a provided log directory."
assert log_dir is not None, (
"Can't reset logger without a provided log directory."
)
os.makedirs(log_dir, exist_ok=True)
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
@@ -528,9 +528,9 @@ if __name__ == '__main__':
# Load predictions
assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
predictions = pd.read_json(args.input_file, lines=True)
assert (
'instance_id' in predictions.columns
), 'Input file must contain instance_id column.'
assert 'instance_id' in predictions.columns, (
'Input file must contain instance_id column.'
)
if 'test_suite' not in predictions.columns and (
'test_result' in predictions.columns
@@ -562,9 +562,9 @@ if __name__ == '__main__':
lambda x: x['test_suite']
)
assert len(predictions['instance_id'].unique()) == len(
predictions
), 'instance_id column must be unique.'
assert len(predictions['instance_id'].unique()) == len(predictions), (
'instance_id column must be unique.'
)
assert {'instance_id_swebench', 'test_suite', 'instance_id'}.issubset(
set(predictions.columns)

View File

@@ -1,5 +1,5 @@
import sys
from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union
from typing import Callable, Optional, Sequence, TypeVar, Union
import nltk
import numpy as np
@@ -11,7 +11,7 @@ if sys.getrecursionlimit() < 10_000:
sys.setrecursionlimit(10_000)
def bleu(gold: List[str], pred: List[str]) -> float:
def bleu(gold: list[str], pred: list[str]) -> float:
"""
Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
@@ -29,7 +29,7 @@ def bleu(gold: List[str], pred: List[str]) -> float:
)
def batch_bleu(golds: List[List[str]], preds: List[List[str]]) -> List[float]:
def batch_bleu(golds: list[list[str]], preds: list[list[str]]) -> list[float]:
"""
Calculate BLEU score for a batch of sentences.
@@ -42,7 +42,7 @@ def batch_bleu(golds: List[List[str]], preds: List[List[str]]) -> List[float]:
return [bleu(gold, pred) for gold, pred in zip(golds, preds)]
def corpus_bleu(golds: List[List[str]], preds: List[List[str]]) -> float:
def corpus_bleu(golds: list[list[str]], preds: list[list[str]]) -> float:
"""
Calculate corpus-level BLEU score for a batch of sentences.
@@ -61,7 +61,7 @@ def corpus_bleu(golds: List[List[str]], preds: List[List[str]]) -> float:
def edit_sim(
gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = ' '
gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
) -> float:
"""
Calculate char-level edit similarity, in the range of 0~100.
@@ -81,10 +81,10 @@ def edit_sim(
def batch_edit_sim(
golds: List[Union[str, List[str]]],
preds: List[Union[str, List[str]]],
golds: list[Union[str, list[str]]],
preds: list[Union[str, list[str]]],
sep: str = ' ',
) -> List[float]:
) -> list[float]:
"""
Calculate char-level edit similarity for a batch of sentences.
@@ -114,7 +114,7 @@ def exact_match(gold: T, pred: T) -> float:
return 100.0 if gold == pred else 0.0
def batch_exact_match(golds: List[T], preds: List[T]) -> List[float]:
def batch_exact_match(golds: list[T], preds: list[T]) -> list[float]:
"""
Calculate exact match accuracy for a batch of sentences.
@@ -128,8 +128,8 @@ def batch_exact_match(golds: List[T], preds: List[T]) -> List[float]:
def rouge_l(
gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = ' '
) -> Dict[str, float]:
gold: Union[str, list[str]], pred: Union[str, list[str]], sep: str = ' '
) -> dict[str, float]:
"""
Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
@@ -152,10 +152,10 @@ def rouge_l(
def batch_rouge_l(
golds: List[Union[str, List[str]]],
preds: List[Union[str, List[str]]],
golds: list[Union[str, list[str]]],
preds: list[Union[str, list[str]]],
sep: str = ' ',
) -> Dict[str, List[float]]:
) -> dict[str, list[float]]:
"""
Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
@@ -171,8 +171,8 @@ def batch_rouge_l(
def accuracy(
gold: List[str],
pred: List[str],
gold: list[str],
pred: list[str],
ignore: Optional[Sequence[str]] = None,
) -> float:
"""
@@ -206,10 +206,10 @@ def accuracy(
def batch_accuracy(
golds: List[List[str]],
preds: List[List[str]],
golds: list[list[str]],
preds: list[list[str]],
ignore: Optional[Sequence[str]] = None,
) -> List[float]:
) -> list[float]:
"""
Calculate token-level accuracy for a batch of sentences.
@@ -224,8 +224,8 @@ def batch_accuracy(
def first_match_to_topk(
first_match_list: List[int], k_values: List[int]
) -> Dict[int, List[float]]:
first_match_list: list[int], k_values: list[int]
) -> dict[int, list[float]]:
"""
Calculate top-k accuracy with the first match ranks (1-indexed).
@@ -250,7 +250,7 @@ def pass_at_k(n: int, c: int, k: int) -> float:
return (1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)).item()) * 100
def self_bleu(samples: List[List[str]]) -> float:
def self_bleu(samples: list[list[str]]) -> float:
"""
Calculate self-BLEU among the samples.
:param samples: the chosen m samples
@@ -273,7 +273,7 @@ def self_bleu(samples: List[List[str]]) -> float:
return np.mean(scores).item()
def self_edit_distance(samples: List[Union[str, List[str]]], sep=' ') -> float:
def self_edit_distance(samples: list[Union[str, list[str]]], sep=' ') -> float:
"""
Calculate self-edit-distance among the samples.
:param samples: the chosen m samples
@@ -299,7 +299,7 @@ def self_edit_distance(samples: List[Union[str, List[str]]], sep=' ') -> float:
return np.mean(scores).item()
QUALITY_METRICS: Dict[str, Callable[[List[str], List[str]], float]] = {
QUALITY_METRICS: dict[str, Callable[[list[str], list[str]], float]] = {
'bleu': bleu,
'xmatch': exact_match,
'edit-sim': edit_sim,

View File

@@ -95,9 +95,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
if RUN_WITH_BROWSING:
instruction += (
'<IMPORTANT!>\n'
'You SHOULD NEVER attempt to browse the web. '
'</IMPORTANT!>\n'
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
)
return instruction
@@ -243,7 +241,7 @@ def initialize_runtime(
# Copy the file to the desired location
action = CmdRunAction(
command=f"cp /tmp/test_suite.py /testbed/{instance['test_file']}"
command=f'cp /tmp/test_suite.py /testbed/{instance["test_file"]}'
)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})

View File

@@ -71,9 +71,10 @@ def process_images(dataset, original_namespace, new_namespace, start_instance_id
patch_file_path = 'patch.diff'
test_patch_file_path = 'test_patch.diff'
with open(patch_file_path, 'w') as patch_file, open(
test_patch_file_path, 'w'
) as test_patch_file:
with (
open(patch_file_path, 'w') as patch_file,
open(test_patch_file_path, 'w') as test_patch_file,
):
patch_file.write(datum['patch'])
test_patch_file.write(datum['test_patch'])

View File

@@ -34,17 +34,17 @@ def convert_history_to_str(history):
if isinstance(event, list):
# "event" is a legacy pair of (action, observation)
event_obj = event_from_dict(event[0])
ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
ret += f'## {i + 1}| {event_obj.__class__.__name__}\n\n'
ret += str(event_obj)
ret += separator
event_obj = event_from_dict(event[1])
ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
ret += f'## {i + 1}| {event_obj.__class__.__name__}\n\n'
ret += str(event_obj)
else:
# "event" is a single event
event_obj = event_from_dict(event)
ret += f'## {i+1}| {event_obj.__class__.__name__}\n\n'
ret += f'## {i + 1}| {event_obj.__class__.__name__}\n\n'
ret += str(event_obj)
return ret

View File

@@ -1,6 +1,5 @@
import ast
import re
from typing import List, Tuple
from evaluation.benchmarks.testgeneval.constants import TestStatus
from evaluation.benchmarks.testgeneval.log_parsers import (
@@ -37,7 +36,7 @@ def extract_preamble_classes_and_functions(code):
current_position = 0
def extract_class_body(code: str, start_index: int) -> Tuple[str, int]:
def extract_class_body(code: str, start_index: int) -> tuple[str, int]:
"""
Extracts the body of a class from the given code starting from the specified index.
Returns the class body and the end index of the class body.
@@ -168,7 +167,7 @@ def extract_preamble_classes_and_functions(code):
def filter_passing_tests(
test_content: str, test_output: str, repo: str
) -> Tuple[str, List[str], List[str]]:
) -> tuple[str, list[str], list[str]]:
"""
Filter tests based on their execution results.
Returns:
@@ -246,7 +245,7 @@ def filter_passing_tests(
def filter_tests(
test_content: str, test_output: str, repo: str
) -> Tuple[str, List[str], List[str]]:
) -> tuple[str, list[str], list[str]]:
"""
Filter tests using AST parsing to remove failing test functions from the test file.
Non-test functions (e.g. setup or helper methods) and classes (even if all test methods are failing)

View File

@@ -24,7 +24,7 @@ def get_test_directives(instance: TestGenEvalInstance) -> list:
return ['test.py']
# Get test directives from test patch and remove non-test files
directives = [f"/testbed/{instance['test_file']}"]
directives = [f'/testbed/{instance["test_file"]}']
# For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing)
if instance['repo'] == 'django/django':
@@ -65,8 +65,8 @@ def load_testgeneval_dataset(
if ids - dataset_ids:
raise ValueError(
(
"Some instance IDs not found in dataset!"
f"\nMissing IDs:\n{' '.join(ids - dataset_ids)}"
'Some instance IDs not found in dataset!'
f'\nMissing IDs:\n{" ".join(ids - dataset_ids)}'
)
)
dataset = [instance for instance in dataset if instance['id'] in ids]

View File

@@ -7,7 +7,7 @@ import os
import re
from dataclasses import dataclass
from enum import Enum, auto
from typing import Dict, List, Union
from typing import Union
from openhands.core.logger import openhands_logger as logger
from openhands.events.action import BrowseInteractiveAction
@@ -100,7 +100,7 @@ class ClickAction(BrowserAction):
return f'click("{self.selector}")'
def parse_content_to_elements(content: str) -> Dict[str, str]:
def parse_content_to_elements(content: str) -> dict[str, str]:
"""Parse the observation content into a dictionary mapping anchors to their descriptions"""
elements = {}
current_anchor = None
@@ -170,7 +170,7 @@ def resolve_action(action: BrowserAction, content: str) -> BrowserAction:
def pre_login(
runtime: Runtime,
services: List[str],
services: list[str],
save_screenshots=True,
screenshots_dir='screenshots',
):

View File

@@ -8,7 +8,6 @@ import json
import os
import shutil
import tempfile
from typing import List
import yaml
from browsing import pre_login
@@ -68,7 +67,7 @@ def get_config(
return config
def load_dependencies(runtime: Runtime) -> List[str]:
def load_dependencies(runtime: Runtime) -> list[str]:
"""
Every task has a dependencies.yml file, which lists all the services that the
task depends on. This function loads the file and returns all dependent service names.
@@ -128,7 +127,7 @@ def run_solver(
runtime: Runtime,
task_name: str,
config: AppConfig,
dependencies: List[str],
dependencies: list[str],
save_final_state: bool,
state_dir: str,
save_screenshots: bool,

View File

@@ -8,7 +8,6 @@ import json
import os
import re
import sys
from typing import Dict, Tuple
def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
@@ -60,7 +59,7 @@ def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> fl
raise ValueError(f'Unknown model: {model}')
def analyze_eval_json_file(filepath: str) -> Tuple[int, int]:
def analyze_eval_json_file(filepath: str) -> tuple[int, int]:
"""
Analyze a single eval JSON file and extract the total and result from final_score.
@@ -84,7 +83,7 @@ def analyze_eval_json_file(filepath: str) -> Tuple[int, int]:
return (0, 0)
def analyze_traj_json_file(filepath: str) -> Tuple[int, float]:
def analyze_traj_json_file(filepath: str) -> tuple[int, float]:
"""
Analyze a single trajectory JSON file and extract the steps and tokens
for each step. Then estimate the cost based on the tokens and the model type.
@@ -115,7 +114,7 @@ def analyze_traj_json_file(filepath: str) -> Tuple[int, float]:
def analyze_folder(
folder_path: str,
) -> Tuple[Dict[str, Tuple[int, int]], Dict[str, Tuple[int, float]]]:
) -> tuple[dict[str, tuple[int, int]], dict[str, tuple[int, float]]]:
"""
Analyze all eval_*.json & traj_*.json files in the specified folder.
@@ -252,7 +251,7 @@ def main():
print('\n## Summary\n')
print(f'**Tasks Evaluated:** {len(eval_results)}\n')
print(
f'**Perfect Completions:** {perfect_completions}/{len(eval_results)} ({(perfect_completions/len(eval_results)*100):.2f}%)\n'
f'**Perfect Completions:** {perfect_completions}/{len(eval_results)} ({(perfect_completions / len(eval_results) * 100):.2f}%)\n'
)
overall_score = (
@@ -278,10 +277,10 @@ def main():
print('\n## Statistics\n')
print('| Metric | Value |')
print('|---------|--------|')
print(f'| Highest Task Score | {highest_score*100:.2f}% |')
print(f'| Lowest Task Score | {lowest_score*100:.2f}% |')
print(f'| Median Task Score | {median_score*100:.2f}% |')
print(f'| Average Task Score | {avg_score*100:.2f}% |')
print(f'| Highest Task Score | {highest_score * 100:.2f}% |')
print(f'| Lowest Task Score | {lowest_score * 100:.2f}% |')
print(f'| Median Task Score | {median_score * 100:.2f}% |')
print(f'| Average Task Score | {avg_score * 100:.2f}% |')
# compute avg score per nature category
print('\n## Statistics per Nature Category\n')
@@ -307,9 +306,11 @@ def main():
if nature_category == task_nature and is_perfect
)
print(
f'| Perfect Completions for {task_nature} | {perfect_completions}/{num_of_tasks} ({perfect_completions/num_of_tasks*100:.2f}%) |'
f'| Perfect Completions for {task_nature} | {perfect_completions}/{num_of_tasks} ({perfect_completions / num_of_tasks * 100:.2f}%) |'
)
print(
f'| Average Score for {task_nature} | {task_nature_score * 100:.2f}% |'
)
print(f'| Average Score for {task_nature} | {task_nature_score*100:.2f}% |')
if __name__ == '__main__':

View File

@@ -64,7 +64,7 @@ def initialize_runtime(runtime: Runtime):
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set instance id
@@ -80,7 +80,7 @@ def initialize_runtime(runtime: Runtime):
runtime.add_env_vars({'WOLFRAM_ALPHA_APPID': args.wolfram_alpha_appid})
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):

View File

@@ -100,7 +100,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set instance id
@@ -116,7 +116,7 @@ def initialize_runtime(
goal_image_urls = []
if hasattr(obs, 'goal_image_urls'):
goal_image_urls = obs.goal_image_urls
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
return goal, goal_image_urls
@@ -129,7 +129,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
@@ -137,7 +137,7 @@ def complete_runtime(
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return {
'rewards': json.loads(obs.content),
}

View File

@@ -87,7 +87,7 @@ def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
obs: CmdOutputObservation
# Set instance id
@@ -102,7 +102,7 @@ def initialize_runtime(
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
goal = obs.content
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
return goal
@@ -115,7 +115,7 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} BEGIN Runtime Completion Fn {"-" * 50}')
obs: CmdOutputObservation
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
@@ -123,7 +123,7 @@ def complete_runtime(
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f'{"-" * 50} END Runtime Completion Fn {"-" * 50}')
return {
'rewards': json.loads(obs.content),
}

View File

@@ -93,14 +93,14 @@ def process_instance(
spec = importlib.util.spec_from_file_location(instance_id, instance.file_path)
test_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(test_module)
assert hasattr(
test_module, 'Test'
), f'Test module {instance_id} does not have a Test class'
assert hasattr(test_module, 'Test'), (
f'Test module {instance_id} does not have a Test class'
)
test_class: type[BaseIntegrationTest] = test_module.Test
assert issubclass(
test_class, BaseIntegrationTest
), f'Test class {instance_id} does not inherit from BaseIntegrationTest'
assert issubclass(test_class, BaseIntegrationTest), (
f'Test class {instance_id} does not inherit from BaseIntegrationTest'
)
instruction = test_class.INSTRUCTION

View File

@@ -132,7 +132,7 @@ def run_test_case(test_cases_dir, workspace_dir, request):
'python3',
f'{SCRIPT_DIR}/../../openhands/main.py',
'-d',
f"{os.path.join(agent_dir, 'workspace')}",
f'{os.path.join(agent_dir, "workspace")}',
'-c',
f'{agents_ref[agent]}',
'-t',
@@ -165,7 +165,7 @@ def pytest_configure(config):
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
handlers=[
logging.FileHandler(f"test_results_{now.strftime('%Y%m%d_%H%M%S')}.log"),
logging.FileHandler(f'test_results_{now.strftime("%Y%m%d_%H%M%S")}.log'),
logging.StreamHandler(),
],
)

View File

@@ -221,9 +221,9 @@ def prepare_dataset(
eval_ids: list[str] | None = None,
skip_num: int | None = None,
):
assert (
'instance_id' in dataset.columns
), "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
assert 'instance_id' in dataset.columns, (
"Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
)
id_column = 'instance_id'
logger.info(f'Writing evaluation output to {output_file}')
finished_ids: set[str] = set()
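
Note: the hunk above shows a wrapping style that recurs throughout this commit: when an assert with a message is too long for one line, the condition now stays on the assert line and the message is parenthesized, instead of parenthesizing the condition. Both layouts are equivalent at runtime; a minimal sketch of the pattern, assuming the new layout comes from the updated formatter:

rows = ['a', 'b']

# Old layout: the condition is wrapped in parentheses
assert (
    len(rows) > 0
), 'Expected at least one row in the dataset.'

# New layout: the condition stays inline and the message is parenthesized
assert len(rows) > 0, (
    'Expected at least one row in the dataset.'
)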

View File

@@ -39,7 +39,9 @@ def refine_prompt(prompt: str):
def create_cmd_run_tool(
use_short_description: bool = False,
) -> ChatCompletionToolParam:
description = _SHORT_BASH_DESCRIPTION if use_short_description else _DETAILED_BASH_DESCRIPTION
description = (
_SHORT_BASH_DESCRIPTION if use_short_description else _DETAILED_BASH_DESCRIPTION
)
return ChatCompletionToolParam(
type='function',
function=ChatCompletionToolParamFunctionChunk(

View File

@@ -131,12 +131,12 @@ upload_file(bid: str, file: str | list[str])
for _, action in _browser_action_space.action_set.items():
assert (
action.signature in _BROWSER_TOOL_DESCRIPTION
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
assert (
action.description in _BROWSER_TOOL_DESCRIPTION
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
assert action.signature in _BROWSER_TOOL_DESCRIPTION, (
f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
)
assert action.description in _BROWSER_TOOL_DESCRIPTION, (
f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
)
BrowserTool = ChatCompletionToolParam(
type='function',

View File

@@ -52,7 +52,7 @@ class ReadOnlyAgent(CodeActAgent):
super().__init__(llm, config)
logger.debug(
f"TOOLS loaded for ReadOnlyAgent: {', '.join([tool.get('function').get('name') for tool in self.tools])}"
f'TOOLS loaded for ReadOnlyAgent: {", ".join([tool.get("function").get("name") for tool in self.tools])}'
)
@property

View File

@@ -42,7 +42,7 @@ Review the current state of the page and all other information to find the best
goal_image_urls = []
if image_urls is not None:
for idx, url in enumerate(image_urls):
goal_txt = goal_txt + f'Images: Goal input image ({idx+1})\n'
goal_txt = goal_txt + f'Images: Goal input image ({idx + 1})\n'
goal_image_urls.append(url)
goal_txt += '\n'
return goal_txt, goal_image_urls
@@ -111,7 +111,7 @@ Note: This action set allows you to interact with your environment. Most of them
def get_history_prompt(prev_actions: list[BrowseInteractiveAction]) -> str:
history_prompt = ['# History of all previous interactions with the task:\n']
for i in range(len(prev_actions)):
history_prompt.append(f'## step {i+1}')
history_prompt.append(f'## step {i + 1}')
history_prompt.append(
            f'\nOutput thought and action: {prev_actions[i].thought} ```{prev_actions[i].browser_actions}```\n'
)
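
Note: these hunks show two cosmetic f-string changes that appear across many files of this diff: the outer quotes are normalized to single quotes (flipping any quotes used inside replacement fields to double quotes), and expressions inside replacement fields gain standard operator spacing, e.g. {i+1} becomes {i + 1}. The rendered output is identical; a small sketch with hypothetical values:

value = {'name': 'demo'}
i = 0

# Before: double-quoted f-string, no spaces around + inside the braces
print(f"step {i+1}: {value['name']}")

# After: single-quoted f-string, inner quotes flipped, expression spaced out
print(f'step {i + 1}: {value["name"]}')

# Both lines print: step 1: demo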

View File

@@ -288,7 +288,12 @@ async def main(loop: asyncio.AbstractEventLoop):
# Use settings from settings store if available and override with command line arguments
if settings:
config.default_agent = args.agent_cls if args.agent_cls else settings.agent
if args.agent_cls:
config.default_agent = str(args.agent_cls)
else:
# settings.agent is not None because we check for it in setup_config_from_args
assert settings.agent is not None
config.default_agent = settings.agent
if not args.llm_config and settings.llm_model and settings.llm_api_key:
llm_config = config.get_llm_config()
llm_config.model = settings.llm_model

View File

@@ -84,7 +84,7 @@ def display_settings(config: AppConfig):
# Construct the summary text with aligned columns
settings_lines = [
f'{label+":":<{max_label_width+1}} {value:<}' # Changed value alignment to left (<)
f'{label + ":":<{max_label_width + 1}} {value:<}' # Changed value alignment to left (<)
for label, value in str_labels_and_values
]
settings_text = '\n'.join(settings_lines)

View File

@@ -549,7 +549,7 @@ def cli_confirm(
] + [
(
'class:selected' if i == selected[0] else 'class:unselected',
f"{'> ' if i == selected[0] else ' '}{choice}\n",
f'{"> " if i == selected[0] else " "}{choice}\n',
)
for i, choice in enumerate(choices)
]

View File

@@ -167,17 +167,17 @@ class Agent(ABC):
- mcp_tools (list[dict]): The list of MCP tools.
"""
logger.info(
f"Setting {len(mcp_tools)} MCP tools for agent {self.name}: {[tool['function']['name'] for tool in mcp_tools]}"
f'Setting {len(mcp_tools)} MCP tools for agent {self.name}: {[tool["function"]["name"] for tool in mcp_tools]}'
)
for tool in mcp_tools:
_tool = ChatCompletionToolParam(**tool)
if _tool['function']['name'] in self.mcp_tools:
logger.warning(
f"Tool {_tool['function']['name']} already exists, skipping"
f'Tool {_tool["function"]["name"]} already exists, skipping'
)
continue
self.mcp_tools[_tool['function']['name']] = _tool
self.tools.append(_tool)
logger.info(
f"Tools updated for agent {self.name}, total {len(self.tools)}: {[tool['function']['name'] for tool in self.tools]}"
f'Tools updated for agent {self.name}, total {len(self.tools)}: {[tool["function"]["name"] for tool in self.tools]}'
)

View File

@@ -220,7 +220,7 @@ class State:
'trace_version': openhands.__version__,
'tags': [
f'agent:{agent_name}',
f"web_host:{os.environ.get('WEB_HOST', 'unspecified')}",
f'web_host:{os.environ.get("WEB_HOST", "unspecified")}',
f'openhands_version:{openhands.__version__}',
],
}

View File

@@ -142,9 +142,9 @@ async def run_controller(
agent, runtime, config, replay_events=replay_events
)
assert isinstance(
initial_user_action, Action
), f'initial user actions must be an Action, got {type(initial_user_action)}'
assert isinstance(initial_user_action, Action), (
f'initial user actions must be an Action, got {type(initial_user_action)}'
)
logger.debug(
f'Agent Controller Initialized: Running agent {agent.name}, model '
f'{agent.llm.config.model}, with actions: {initial_user_action}'

View File

@@ -149,9 +149,9 @@ class Message(BaseModel):
# an observation message with tool response
if self.tool_call_id is not None:
assert (
self.name is not None
), 'name is required when tool_call_id is not None'
assert self.name is not None, (
'name is required when tool_call_id is not None'
)
message_dict['tool_call_id'] = self.tool_call_id
message_dict['name'] = self.name

View File

@@ -36,9 +36,7 @@ class BrowseInteractiveAction(Action):
@property
def message(self) -> str:
return (
f'I am interacting with the browser:\n' f'```\n{self.browser_actions}\n```'
)
return f'I am interacting with the browser:\n```\n{self.browser_actions}\n```'
def __str__(self) -> str:
ret = '**BrowseInteractiveAction**\n'

View File

@@ -115,13 +115,13 @@ class FileEditObservation(Observation):
for idx, line in enumerate(old_lines[i1:i2]):
line_num = i1 + idx + 1
cur_group['before_edits'].append(
f'-{line_num:>{_indent_pad_size-1}}|{line}'
f'-{line_num:>{_indent_pad_size - 1}}|{line}'
)
if tag in {'replace', 'insert'}:
for idx, line in enumerate(new_lines[j1:j2]):
line_num = j1 + idx + 1
cur_group['after_edits'].append(
f'+{line_num:>{_indent_pad_size-1}}|{line}'
f'+{line_num:>{_indent_pad_size - 1}}|{line}'
)
edit_groups.append(cur_group)
return edit_groups
@@ -169,12 +169,12 @@ class FileEditObservation(Observation):
for i, cur_edit_group in enumerate(edit_groups):
if i != 0:
result.append('-------------------------')
result.append(f'[begin of {op_type} {i+1} / {len(edit_groups)}]')
result.append(f'[begin of {op_type} {i + 1} / {len(edit_groups)}]')
result.append(f'(content before {op_type})')
result.extend(cur_edit_group['before_edits'])
result.append(f'(content after {op_type})')
result.extend(cur_edit_group['after_edits'])
result.append(f'[end of {op_type} {i+1} / {len(edit_groups)}]')
result.append(f'[end of {op_type} {i + 1} / {len(edit_groups)}]')
# Cache the result
self._diff_cache = '\n'.join(result)
@@ -186,9 +186,9 @@ class FileEditObservation(Observation):
return self.content
if not self.prev_exist:
assert (
self.old_content == ''
), 'old_content should be empty if the file is new (prev_exist=False).'
assert self.old_content == '', (
'old_content should be empty if the file is new (prev_exist=False).'
)
return f'[New file {self.path} is created with the provided content.]\n'
# Use cached diff if available, otherwise compute it

View File

@@ -277,7 +277,7 @@ class GitHubService(BaseGitService, GitService):
result = response.json()
if 'errors' in result:
raise UnknownException(
f"GraphQL query error: {json.dumps(result['errors'])}"
f'GraphQL query error: {json.dumps(result["errors"])}'
)
return dict(result)

View File

@@ -253,12 +253,12 @@ def convert_tool_call_to_string(tool_call: dict) -> str:
if tool_call['type'] != 'function':
raise FunctionCallConversionError("Tool call type must be 'function'.")
ret = f"<function={tool_call['function']['name']}>\n"
ret = f'<function={tool_call["function"]["name"]}>\n'
try:
args = json.loads(tool_call['function']['arguments'])
except json.JSONDecodeError as e:
raise FunctionCallConversionError(
f"Failed to parse arguments as JSON. Arguments: {tool_call['function']['arguments']}"
f'Failed to parse arguments as JSON. Arguments: {tool_call["function"]["arguments"]}'
) from e
for param_name, param_value in args.items():
is_multiline = isinstance(param_value, str) and '\n' in param_value
@@ -280,8 +280,8 @@ def convert_tools_to_description(tools: list[dict]) -> str:
fn = tool['function']
if i > 0:
ret += '\n'
ret += f"---- BEGIN FUNCTION #{i+1}: {fn['name']} ----\n"
ret += f"Description: {fn['description']}\n"
ret += f'---- BEGIN FUNCTION #{i + 1}: {fn["name"]} ----\n'
ret += f'Description: {fn["description"]}\n'
if 'parameters' in fn:
ret += 'Parameters:\n'
@@ -303,12 +303,12 @@ def convert_tools_to_description(tools: list[dict]) -> str:
desc += f'\nAllowed values: [{enum_values}]'
ret += (
f' ({j+1}) {param_name} ({param_type}, {param_status}): {desc}\n'
f' ({j + 1}) {param_name} ({param_type}, {param_status}): {desc}\n'
)
else:
ret += 'No parameters are required for this function.\n'
ret += f'---- END FUNCTION #{i+1} ----\n'
ret += f'---- END FUNCTION #{i + 1} ----\n'
return ret
@@ -667,7 +667,7 @@ def convert_non_fncall_messages_to_fncall_messages(
'content': [{'type': 'text', 'text': tool_result}]
if isinstance(content, list)
else tool_result,
'tool_call_id': f'toolu_{tool_call_counter-1:02d}', # Use last generated ID
'tool_call_id': f'toolu_{tool_call_counter - 1:02d}', # Use last generated ID
}
)
else:
@@ -790,14 +790,14 @@ def convert_from_multiple_tool_calls_to_single_tool_call_messages(
# add the tool result
converted_messages.append(message)
else:
assert (
len(pending_tool_calls) == 0
), f'Found pending tool calls but not found in pending list: {pending_tool_calls=}'
assert len(pending_tool_calls) == 0, (
f'Found pending tool calls but not found in pending list: {pending_tool_calls=}'
)
converted_messages.append(message)
else:
assert (
len(pending_tool_calls) == 0
), f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}'
assert len(pending_tool_calls) == 0, (
f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}'
)
converted_messages.append(message)
if not ignore_final_tool_result and len(pending_tool_calls) > 0:

View File

@@ -158,12 +158,12 @@ async def add_mcp_tools_to_agent(
ActionExecutionClient, # inline import to avoid circular import
)
assert isinstance(
runtime, ActionExecutionClient
), 'Runtime must be an instance of ActionExecutionClient'
assert (
runtime.runtime_initialized
), 'Runtime must be initialized before adding MCP tools'
assert isinstance(runtime, ActionExecutionClient), (
'Runtime must be an instance of ActionExecutionClient'
)
assert runtime.runtime_initialized, (
'Runtime must be initialized before adding MCP tools'
)
# Add the runtime as another MCP server
updated_mcp_config = runtime.get_updated_mcp_config()
@@ -171,7 +171,7 @@ async def add_mcp_tools_to_agent(
mcp_tools = await fetch_mcp_tools_from_config(updated_mcp_config)
logger.info(
f"Loaded {len(mcp_tools)} MCP tools: {[tool['function']['name'] for tool in mcp_tools]}"
f'Loaded {len(mcp_tools)} MCP tools: {[tool["function"]["name"] for tool in mcp_tools]}'
)
# Set the MCP tools on the agent

View File

@@ -214,7 +214,7 @@ class GitlabIssueHandler(IssueHandlerInterface):
def reply_to_comment(self, pr_number: int, comment_id: str, reply: str) -> None:
response = httpx.get(
f'{self.base_url}/merge_requests/{pr_number}/discussions/{comment_id.split('/')[-1]}',
f'{self.base_url}/merge_requests/{pr_number}/discussions/{comment_id.split("/")[-1]}',
headers=self.headers,
)
response.raise_for_status()
@@ -225,7 +225,7 @@ class GitlabIssueHandler(IssueHandlerInterface):
'note_id': discussions.get('notes', [])[-1]['id'],
}
response = httpx.post(
f'{self.base_url}/merge_requests/{pr_number}/discussions/{comment_id.split('/')[-1]}/notes',
f'{self.base_url}/merge_requests/{pr_number}/discussions/{comment_id.split("/")[-1]}/notes',
headers=self.headers,
json=data,
)

View File

@@ -99,7 +99,7 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
logger.info(f'Build status: {status}')
if status == 'SUCCESS':
logger.debug(f"Successfully built {status_data['image']}")
logger.debug(f'Successfully built {status_data["image"]}')
return str(status_data['image'])
elif status in [
'FAILURE',
@@ -139,9 +139,9 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
if result['exists']:
logger.debug(
f"Image {image_name} exists. "
f"Uploaded at: {result['image']['upload_time']}, "
f"Size: {result['image']['image_size_bytes'] / 1024 / 1024:.2f} MB"
f'Image {image_name} exists. '
f'Uploaded at: {result["image"]["upload_time"]}, '
f'Size: {result["image"]["image_size_bytes"] / 1024 / 1024:.2f} MB'
)
else:
logger.debug(f'Image {image_name} does not exist.')

View File

@@ -115,12 +115,12 @@ class DaytonaRuntime(ActionExecutionClient):
def _construct_api_url(self, port: int) -> str:
assert self.workspace is not None, 'Workspace is not initialized'
assert (
self.workspace.instance.info is not None
), 'Workspace info is not available'
assert (
self.workspace.instance.info.provider_metadata is not None
), 'Provider metadata is not available'
assert self.workspace.instance.info is not None, (
'Workspace info is not available'
)
assert self.workspace.instance.info.provider_metadata is not None, (
'Provider metadata is not available'
)
node_domain = json.loads(self.workspace.instance.info.provider_metadata)[
'nodeDomain'

View File

@@ -40,9 +40,9 @@ class E2BBox:
def _archive(self, host_src: str, recursive: bool = False):
if recursive:
assert os.path.isdir(
host_src
), 'Source must be a directory when recursive is True'
assert os.path.isdir(host_src), (
'Source must be a directory when recursive is True'
)
files = glob(host_src + '/**/*', recursive=True)
srcname = os.path.basename(host_src)
tar_filename = os.path.join(os.path.dirname(host_src), srcname + '.tar')
@@ -52,9 +52,9 @@ class E2BBox:
file, arcname=os.path.relpath(file, os.path.dirname(host_src))
)
else:
assert os.path.isfile(
host_src
), 'Source must be a file when recursive is False'
assert os.path.isfile(host_src), (
'Source must be a file when recursive is False'
)
srcname = os.path.basename(host_src)
tar_filename = os.path.join(os.path.dirname(host_src), srcname + '.tar')
with tarfile.open(tar_filename, mode='w') as tar:

View File

@@ -130,12 +130,12 @@ class RemoteRuntime(ActionExecutionClient):
)
self.container_image = self.config.sandbox.runtime_container_image
self._start_runtime()
assert (
self.runtime_id is not None
), 'Runtime ID is not set. This should never happen.'
assert (
self.runtime_url is not None
), 'Runtime URL is not set. This should never happen.'
assert self.runtime_id is not None, (
'Runtime ID is not set. This should never happen.'
)
assert self.runtime_url is not None, (
'Runtime URL is not set. This should never happen.'
)
self.send_status_message('STATUS$WAITING_FOR_CLIENT')
if not self.attach_to_existing:
self.log('info', 'Waiting for runtime to be alive...')

View File

@@ -157,7 +157,7 @@ def _print_window(
else:
output += '(this is the beginning of the file)\n'
for i in range(start, end + 1):
_new_line = f'{i}|{lines[i-1]}'
_new_line = f'{i}|{lines[i - 1]}'
if not _new_line.endswith('\n'):
_new_line += '\n'
output += _new_line

View File

@@ -189,7 +189,7 @@ class JupyterKernel:
if os.environ.get('DEBUG'):
logging.info(
f"MSG TYPE: {msg_type.upper()} DONE:{execution_done}\nCONTENT: {msg_dict['content']}"
f'MSG TYPE: {msg_type.upper()} DONE:{execution_done}\nCONTENT: {msg_dict["content"]}'
)
if msg_type == 'error':
@@ -203,7 +203,7 @@ class JupyterKernel:
if 'image/png' in msg_dict['content']['data']:
                        # use markdown to display image (in case of large image)
outputs.append(
f"\n![image](data:image/png;base64,{msg_dict['content']['data']['image/png']})\n"
f'\n![image](data:image/png;base64,{msg_dict["content"]["data"]["image/png"]})\n'
)
elif msg_type == 'execute_reply':
@@ -272,7 +272,7 @@ class ExecuteHandler(tornado.web.RequestHandler):
def make_app() -> tornado.web.Application:
jupyter_kernel = JupyterKernel(
f"localhost:{os.environ.get('JUPYTER_GATEWAY_PORT', '8888')}",
f'localhost:{os.environ.get("JUPYTER_GATEWAY_PORT", "8888")}',
os.environ.get('JUPYTER_GATEWAY_KERNEL_ID', 'default'),
)
asyncio.get_event_loop().run_until_complete(jupyter_kernel.initialize())

View File

@@ -501,9 +501,9 @@ class BashSession:
if len(splited_commands) > 1:
return ErrorObservation(
content=(
f"ERROR: Cannot execute multiple commands at once.\n"
f"Please run each command separately OR chain them into a single command via && or ;\n"
f"Provided commands:\n{'\n'.join(f'({i + 1}) {cmd}' for i, cmd in enumerate(splited_commands))}"
f'ERROR: Cannot execute multiple commands at once.\n'
f'Please run each command separately OR chain them into a single command via && or ;\n'
f'Provided commands:\n{"\n".join(f"({i + 1}) {cmd}" for i, cmd in enumerate(splited_commands))}'
)
)
@@ -591,8 +591,8 @@ class BashSession:
logger.debug(
f'PANE CONTENT GOT after {time.time() - _start_time:.2f} seconds'
)
logger.debug(f"BEGIN OF PANE CONTENT: {cur_pane_output.split('\n')[:10]}")
logger.debug(f"END OF PANE CONTENT: {cur_pane_output.split('\n')[-10:]}")
logger.debug(f'BEGIN OF PANE CONTENT: {cur_pane_output.split("\n")[:10]}')
logger.debug(f'END OF PANE CONTENT: {cur_pane_output.split("\n")[-10:]}')
ps1_matches = CmdOutputMetadata.matches_ps1_metadata(cur_pane_output)
current_ps1_count = len(ps1_matches)

View File

@@ -35,8 +35,8 @@ def generate_file_viewer_html(file_path: str) -> str:
# Check if the file extension is supported
if file_extension not in supported_extensions:
raise ValueError(
f"Unsupported file extension: {file_extension}. "
f"Supported extensions are: {', '.join(supported_extensions)}"
f'Unsupported file extension: {file_extension}. '
f'Supported extensions are: {", ".join(supported_extensions)}'
)
# Check if the file exists

View File

@@ -385,9 +385,9 @@ if __name__ == '__main__':
# and create a Dockerfile dynamically and place it in the build_folder only. This allows the Docker image to
# then be created using the Dockerfile (most likely using the containers/build.sh script)
build_folder = args.build_folder
assert os.path.exists(
build_folder
), f'Build folder {build_folder} does not exist'
assert os.path.exists(build_folder), (
f'Build folder {build_folder} does not exist'
)
logger.debug(
f'Copying the source code and generating the Dockerfile in the build folder: {build_folder}'
)

View File

@@ -926,7 +926,7 @@ class WindowsPowershellSession:
content=(
f'ERROR: Cannot execute multiple commands at once.\n'
f'Please run each command separately OR chain them into a single command via PowerShell operators (e.g., ; or |).\n'
f'Detected commands:\n{"\n".join(f"({i+1}) {cmd}" for i, cmd in enumerate(splited_cmds))}'
f'Detected commands:\n{"\n".join(f"({i + 1}) {cmd}" for i, cmd in enumerate(splited_cmds))}'
)
)
elif statements.Count == 0 and not command.strip().startswith('#'):

View File

@@ -176,9 +176,9 @@ class InvariantAnalyzer(SecurityAnalyzer):
],
)
)
assert (
self.guardrail_llm is not None
), 'InvariantAnalyzer.guardrail_llm should be initialized before calling check_usertask'
assert self.guardrail_llm is not None, (
'InvariantAnalyzer.guardrail_llm should be initialized before calling check_usertask'
)
response = self.guardrail_llm.completion(
messages=self.guardrail_llm.format_messages_for_llm(messages),
stop=['.'],
@@ -261,9 +261,9 @@ class InvariantAnalyzer(SecurityAnalyzer):
],
)
)
assert (
self.guardrail_llm is not None
), 'InvariantAnalyzer.guardrail_llm should be initialized before calling check_fillaction'
assert self.guardrail_llm is not None, (
'InvariantAnalyzer.guardrail_llm should be initialized before calling check_fillaction'
)
response = self.guardrail_llm.completion(
messages=self.guardrail_llm.format_messages_for_llm(messages),
stop=['.'],

View File

@@ -20,7 +20,7 @@ TraceElement = Message | ToolCall | ToolOutput | Function
def get_next_id(trace: list[TraceElement]) -> str:
used_ids = [el.id for el in trace if type(el) == ToolCall]
used_ids = [el.id for el in trace if isinstance(el, ToolCall)]
for i in range(1, len(used_ids) + 2):
if str(i) not in used_ids:
return str(i)
@@ -31,7 +31,7 @@ def get_last_id(
trace: list[TraceElement],
) -> str | None:
for el in reversed(trace):
if type(el) == ToolCall:
if isinstance(el, ToolCall):
return el.id
return None
@@ -39,12 +39,12 @@ def get_last_id(
def parse_action(trace: list[TraceElement], action: Action) -> list[TraceElement]:
next_id = get_next_id(trace)
inv_trace: list[TraceElement] = []
if type(action) == MessageAction:
if isinstance(action, MessageAction):
if action.source == EventSource.USER:
inv_trace.append(Message(role='user', content=action.content))
else:
inv_trace.append(Message(role='assistant', content=action.content))
elif type(action) in [NullAction, ChangeAgentStateAction]:
elif isinstance(action, (NullAction, ChangeAgentStateAction)):
pass
elif hasattr(action, 'action') and action.action is not None:
event_dict = event_to_dict(action)
@@ -63,7 +63,7 @@ def parse_observation(
trace: list[TraceElement], obs: Observation
) -> list[TraceElement]:
last_id = get_last_id(trace)
if type(obs) in [NullObservation, AgentStateChangedObservation]:
if isinstance(obs, (NullObservation, AgentStateChangedObservation)):
return []
elif hasattr(obs, 'content') and obs.content is not None:
return [ToolOutput(role='tool', content=obs.content, tool_call_id=last_id)]
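
Note: the hunk above replaces exact type comparisons (type(el) == ToolCall) with isinstance checks, which satisfies the common E721 lint rule and lets a tuple cover several types at once, as in the NullAction/ChangeAgentStateAction branch. This is not purely cosmetic: isinstance also accepts subclasses, while the == comparison does not. A minimal sketch with hypothetical classes:

class ToolCall:
    pass

class TimedToolCall(ToolCall):
    pass

el = TimedToolCall()

print(type(el) == ToolCall)             # False: exact type check rejects subclasses
print(isinstance(el, ToolCall))         # True: isinstance accepts subclasses
print(isinstance(el, (ToolCall, str)))  # True: a tuple checks several types at once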

View File

@@ -7,14 +7,12 @@ from typing import Callable, Iterable
import socketio
from openhands.core.config.app_config import AppConfig
from openhands.core.config.llm_config import LLMConfig
from openhands.core.exceptions import AgentRuntimeUnavailableError
from openhands.core.logger import openhands_logger as logger
from openhands.core.schema.agent import AgentState
from openhands.events.action import MessageAction
from openhands.events.event import EventSource
from openhands.events.event_store import EventStore
from openhands.events.stream import EventStream, EventStreamSubscriber, session_exists
from openhands.events.stream import EventStreamSubscriber, session_exists
from openhands.server.config.server_config import ServerConfig
from openhands.server.monitoring import MonitoringListener
from openhands.server.session.agent_session import WAIT_TIME_BEFORE_CLOSE
@@ -25,7 +23,10 @@ from openhands.storage.data_models.conversation_metadata import ConversationMeta
from openhands.storage.data_models.settings import Settings
from openhands.storage.files import FileStore
from openhands.utils.async_utils import GENERAL_TIMEOUT, call_async_from_sync, wait_all
from openhands.utils.conversation_summary import get_default_conversation_title, auto_generate_title
from openhands.utils.conversation_summary import (
auto_generate_title,
get_default_conversation_title,
)
from openhands.utils.import_utils import get_impl
from openhands.utils.shutdown_listener import should_continue
@@ -208,7 +209,6 @@ class StandaloneConversationManager(ConversationManager):
store = await conversation_store_class.get_instance(self.config, user_id)
return store
async def get_running_agent_loops(
self, user_id: str | None = None, filter_to_sids: set[str] | None = None
) -> set[str]:
@@ -287,7 +287,7 @@ class StandaloneConversationManager(ConversationManager):
response_ids = await self.get_running_agent_loops(user_id)
if len(response_ids) >= self.config.max_concurrent_conversations:
logger.info(
f'too_many_sessions_for:{user_id or ''}',
f'too_many_sessions_for:{user_id or ""}',
extra={'session_id': sid, 'user_id': user_id},
)
# Get the conversations sorted (oldest first)
@@ -300,7 +300,7 @@ class StandaloneConversationManager(ConversationManager):
while len(conversations) >= self.config.max_concurrent_conversations:
oldest_conversation_id = conversations.pop().conversation_id
logger.debug(
f'closing_from_too_many_sessions:{user_id or ''}:{oldest_conversation_id}',
f'closing_from_too_many_sessions:{user_id or ""}:{oldest_conversation_id}',
extra={'session_id': oldest_conversation_id, 'user_id': user_id},
)
# Send status message to client and close session.
@@ -332,7 +332,9 @@ class StandaloneConversationManager(ConversationManager):
try:
session.agent_session.event_stream.subscribe(
EventStreamSubscriber.SERVER,
self._create_conversation_update_callback(user_id, github_user_id, sid, settings),
self._create_conversation_update_callback(
user_id, github_user_id, sid, settings
),
UPDATED_AT_CALLBACK_ID,
)
except ValueError:
@@ -429,7 +431,11 @@ class StandaloneConversationManager(ConversationManager):
)
def _create_conversation_update_callback(
self, user_id: str | None, github_user_id: str | None, conversation_id: str, settings: Settings
self,
user_id: str | None,
github_user_id: str | None,
conversation_id: str,
settings: Settings,
) -> Callable:
def callback(event, *args, **kwargs):
call_async_from_sync(
@@ -444,9 +450,13 @@ class StandaloneConversationManager(ConversationManager):
return callback
async def _update_conversation_for_event(
self, user_id: str, github_user_id: str, conversation_id: str, settings: Settings, event=None
self,
user_id: str,
github_user_id: str,
conversation_id: str,
settings: Settings,
event=None,
):
conversation_store = await self._get_conversation_store(user_id, github_user_id)
conversation = await conversation_store.get_metadata(conversation_id)
@@ -469,8 +479,12 @@ class StandaloneConversationManager(ConversationManager):
token_usage.prompt_tokens + token_usage.completion_tokens
)
default_title = get_default_conversation_title(conversation_id)
if conversation.title == default_title: # attempt to autogenerate if default title is in use
title = await auto_generate_title(conversation_id, user_id, self.file_store, settings)
if (
conversation.title == default_title
): # attempt to autogenerate if default title is in use
title = await auto_generate_title(
conversation_id, user_id, self.file_store, settings
)
if title and not title.isspace():
conversation.title = title
try:

View File

@@ -27,7 +27,7 @@ def store_feedback(feedback: FeedbackDataModel) -> dict[str, str]:
display_feedback = feedback.model_dump()
if 'trajectory' in display_feedback:
display_feedback['trajectory'] = (
f"elided [length: {len(display_feedback['trajectory'])}"
f'elided [length: {len(display_feedback["trajectory"])}'
)
if 'token' in display_feedback:
display_feedback['token'] = 'elided'

View File

@@ -1,4 +1,3 @@
import asyncio
import uuid
from datetime import datetime, timezone
from typing import Any

View File

@@ -1,15 +1,21 @@
[build-system]
build-backend = "poetry.core.masonry.api"
requires = [
"poetry-core",
]
[tool.poetry]
name = "openhands-ai"
version = "0.37.0"
description = "OpenHands: Code Less, Make More"
authors = ["OpenHands"]
authors = [ "OpenHands" ]
license = "MIT"
readme = "README.md"
repository = "https://github.com/All-Hands-AI/OpenHands"
packages = [
{ include = "openhands/**/*" },
{ include = "pyproject.toml", to = "openhands" },
{ include = "poetry.lock", to = "openhands" }
{ include = "poetry.lock", to = "openhands" },
]
[tool.poetry.dependencies]
@@ -40,7 +46,7 @@ tenacity = ">=8.5,<10.0"
zope-interface = "7.2"
pathspec = "^0.12.1"
google-cloud-aiplatform = "*"
anthropic = {extras = ["vertex"], version = "*"}
anthropic = { extras = [ "vertex" ], version = "*" }
tree-sitter = "^0.24.0"
bashlex = "^0.18"
pyjwt = "^2.9.0"
@@ -97,39 +103,12 @@ pandas = "*"
reportlab = "*"
gevent = ">=24.2.1,<26.0.0"
[tool.coverage.run]
concurrency = ["gevent"]
[tool.poetry.group.runtime.dependencies]
jupyterlab = "*"
notebook = "*"
jupyter_kernel_gateway = "*"
flake8 = "*"
[build-system]
build-backend = "poetry.core.masonry.api"
requires = [
"poetry-core",
]
[tool.autopep8]
# autopep8 fights with mypy on line length issue
ignore = [ "E501" ]
[tool.black]
# prevent black (if installed) from changing single quotes to double quotes
skip-string-normalization = true
[tool.ruff.lint]
select = ["D"]
# ignore warnings for missing docstrings
ignore = ["D1"]
[tool.ruff.lint.pydocstyle]
convention = "google"
[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"
@@ -148,14 +127,10 @@ browsergym = "0.13.3"
browsergym-webarena = "0.13.3"
browsergym-miniwob = "0.13.3"
browsergym-visualwebarena = "0.13.3"
boto3-stubs = {extras = ["s3"], version = "^1.37.19"}
boto3-stubs = { extras = [ "s3" ], version = "^1.37.19" }
pyarrow = "20.0.0" # transitive dependency, pinned here to avoid conflicts
datasets = "*"
[tool.poetry-dynamic-versioning]
enable = true
style = "semver"
[tool.poetry.scripts]
openhands = "openhands.core.cli:main"
@@ -164,3 +139,24 @@ fuzzywuzzy = "^0.18.0"
rouge = "^1.0.1"
python-levenshtein = ">=0.26.1,<0.28.0"
tree-sitter-python = "^0.23.6"
[tool.poetry-dynamic-versioning]
enable = true
style = "semver"
[tool.autopep8]
# autopep8 fights with mypy on line length issue
ignore = [ "E501" ]
[tool.black]
# prevent black (if installed) from changing single quotes to double quotes
skip-string-normalization = true
[tool.ruff]
lint.select = [ "D" ]
# ignore warnings for missing docstrings
lint.ignore = [ "D1" ]
lint.pydocstyle.convention = "google"
[tool.coverage.run]
concurrency = [ "gevent" ]

View File

@@ -760,9 +760,9 @@ def test_python_version(temp_dir, runtime_cls, run_as_openhands):
try:
obs = runtime.run_action(CmdRunAction(command='python --version'))
assert isinstance(
obs, CmdOutputObservation
), 'The observation should be a CmdOutputObservation.'
assert isinstance(obs, CmdOutputObservation), (
'The observation should be a CmdOutputObservation.'
)
assert obs.exit_code == 0, 'The exit code should be 0.'
assert 'Python 3' in obs.content, 'The output should contain "Python 3".'
finally:

View File

@@ -25,9 +25,9 @@ def test_env_vars_os_environ(temp_dir, runtime_cls, run_as_openhands):
)
print(obs)
assert obs.exit_code == 0, 'The exit code should be 0.'
assert (
obs.content.strip().split('\n\r')[0].strip() == 'BAZ'
), f'Output: [{obs.content}] for {runtime_cls}'
assert obs.content.strip().split('\n\r')[0].strip() == 'BAZ', (
f'Output: [{obs.content}] for {runtime_cls}'
)
_close_test_runtime(runtime)

View File

@@ -168,9 +168,9 @@ def test_grep_to_cmdrun_paths_with_spaces(runtime_cls, run_as_openhands, temp_di
obs = _run_cmd_action(runtime, cmd)
assert obs.exit_code == 0, f'Grep command failed for path: {path}'
assert (
'function' in obs.content
), f'Expected pattern not found in output for path: {path}'
assert 'function' in obs.content, (
f'Expected pattern not found in output for path: {path}'
)
# Verify the actual file was found
if path == 'src/my project':

View File

@@ -77,9 +77,9 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
action_read = FileReadAction(path='hello.sh')
logger.info(action_read, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_read)
assert isinstance(
obs, FileReadObservation
), 'The observation should be a FileReadObservation.'
assert isinstance(obs, FileReadObservation), (
'The observation should be a FileReadObservation.'
)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.content == 'echo "Hello, World!"\n'

View File

@@ -39,9 +39,9 @@ def test_edit_from_scratch(temp_dir, runtime_cls, run_as_openhands):
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(
obs, FileEditObservation
), 'The observation should be a FileEditObservation.'
assert isinstance(obs, FileEditObservation), (
'The observation should be a FileEditObservation.'
)
action = FileReadAction(
path=os.path.join('/workspace', 'app.py'),
@@ -78,9 +78,9 @@ def test_edit(temp_dir, runtime_cls, run_as_openhands):
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(
obs, FileEditObservation
), 'The observation should be a FileEditObservation.'
assert isinstance(obs, FileEditObservation), (
'The observation should be a FileEditObservation.'
)
action = FileReadAction(
path=os.path.join('/workspace', 'app.py'),
@@ -138,9 +138,9 @@ def test_edit_long_file(temp_dir, runtime_cls, run_as_openhands):
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(
obs, FileEditObservation
), 'The observation should be a FileEditObservation.'
assert isinstance(obs, FileEditObservation), (
'The observation should be a FileEditObservation.'
)
action = FileReadAction(
path=os.path.join('/workspace', 'app.py'),

View File

@@ -23,9 +23,9 @@ from openhands.events.observation import CmdOutputObservation, MCPObservation
def test_default_activated_tools():
project_root = os.path.dirname(openhands.__file__)
mcp_config_path = os.path.join(project_root, 'runtime', 'mcp', 'config.json')
assert os.path.exists(
mcp_config_path
), f'MCP config file not found at {mcp_config_path}'
assert os.path.exists(mcp_config_path), (
f'MCP config file not found at {mcp_config_path}'
)
with open(mcp_config_path, 'r') as f:
mcp_config = json.load(f)
assert 'default' in mcp_config
@@ -63,9 +63,9 @@ async def test_fetch_mcp_via_stdio(temp_dir, runtime_cls, run_as_openhands):
mcp_action = MCPAction(name='fetch', arguments={'url': 'http://localhost:8000'})
obs = await runtime.call_tool_mcp(mcp_action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(
obs, MCPObservation
), 'The observation should be a MCPObservation.'
assert isinstance(obs, MCPObservation), (
'The observation should be a MCPObservation.'
)
result_json = json.loads(obs.content)
assert not result_json['isError']

View File

@@ -468,9 +468,9 @@ def test_stress_runtime_memory_limits_with_repeated_file_edit():
new_str=f'-content_{i:03d}',
)
obs = runtime.run_action(edit_action)
assert (
f'The file {test_file} has been edited' in obs.content
), f'Edit failed at iteration {i}'
assert f'The file {test_file} has been edited' in obs.content, (
f'Edit failed at iteration {i}'
)
logger.info(f'finished iteration {i}')
# Verify final file state using FileEditAction view command

View File

@@ -240,7 +240,9 @@ def test_guess_success_rate_limit_wait_time(mock_litellm_completion, default_con
wait_time = mock_sleep.call_args[0][0]
assert (
default_config.retry_min_wait <= wait_time <= default_config.retry_max_wait
), f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
), (
f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
)
@patch('openhands.llm.llm.litellm_completion')

View File

@@ -71,9 +71,9 @@ def test_pr_title_with_quotes(monkeypatch):
data = kwargs.get('json', {})
title = data.get('title', '')
expected = "Fix issue #123: Issue with 'quotes' and \"double quotes\" and <class 'ValueError'>"
assert (
title == expected
), f'PR title was incorrectly escaped.\nExpected: {expected}\nGot: {title}'
assert title == expected, (
f'PR title was incorrectly escaped.\nExpected: {expected}\nGot: {title}'
)
return MockResponse()
class MockGetResponse:
@@ -98,7 +98,7 @@ def test_pr_title_with_quotes(monkeypatch):
original_run = subprocess.run
def mock_run(*args, **kwargs):
print(f"Running command: {args[0] if args else kwargs.get('args', [])}")
print(f'Running command: {args[0] if args else kwargs.get("args", [])}')
if isinstance(args[0], list) and args[0][0] == 'git':
if 'push' in args[0]:
return subprocess.CompletedProcess(

View File

@@ -478,13 +478,14 @@ async def test_process_issue(
mock_run_controller.return_value = test_case['run_controller_return']
# Patch the necessary functions and methods
with patch(
'openhands.resolver.resolve_issue.create_runtime', mock_create_runtime
), patch(
'openhands.resolver.resolve_issue.run_controller', mock_run_controller
), patch.object(
with (
patch('openhands.resolver.resolve_issue.create_runtime', mock_create_runtime),
patch('openhands.resolver.resolve_issue.run_controller', mock_run_controller),
patch.object(
resolver, 'complete_runtime', return_value={'git_patch': 'test patch'}
), patch.object(resolver, 'initialize_runtime') as mock_initialize_runtime:
),
patch.object(resolver, 'initialize_runtime') as mock_initialize_runtime,
):
# Call the process_issue method
result = await resolver.process_issue(issue, base_commit, handler_instance)
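
Note: the hunk above regroups a chain of patch(...) context managers into a single parenthesized with statement, which Python officially supports from 3.10 onwards; the formatter can then place one context manager per line instead of relying on nested parentheses. A minimal sketch using contextlib.nullcontext as a stand-in for the mock patches:

from contextlib import nullcontext

# Before: all context managers on one logical line
with nullcontext() as first, nullcontext() as second:
    pass

# After: the whole group is parenthesized, one manager per line (Python 3.10+)
with (
    nullcontext() as first,
    nullcontext() as second,
):
    pass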

View File

@@ -142,9 +142,9 @@ index 9daeafb..b02def2 100644
with open(dos_file, 'rb') as f:
dos_content = f.read()
assert (
b'\r\n' not in unix_content
), 'Unix-style line endings were changed to DOS-style'
assert b'\r\n' not in unix_content, (
'Unix-style line endings were changed to DOS-style'
)
assert b'\r\n' in dos_content, 'DOS-style line endings were changed to Unix-style'
# Check if content was updated correctly

View File

@@ -242,7 +242,9 @@ def test_guess_success_rate_limit_wait_time(mock_litellm_completion, default_con
wait_time = mock_sleep.call_args[0][0]
assert (
default_config.retry_min_wait <= wait_time <= default_config.retry_max_wait
), f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
), (
f'Expected wait time between {default_config.retry_min_wait} and {default_config.retry_max_wait} seconds, but got {wait_time}'
)
@patch('openhands.llm.llm.litellm_completion')

View File

@@ -72,9 +72,9 @@ def test_pr_title_with_quotes(monkeypatch):
data = kwargs.get('json', {})
title = data.get('title', '')
expected = "Fix issue #123: Issue with 'quotes' and \"double quotes\" and <class 'ValueError'>"
assert (
title == expected
), f'PR title was incorrectly escaped.\nExpected: {expected}\nGot: {title}'
assert title == expected, (
f'PR title was incorrectly escaped.\nExpected: {expected}\nGot: {title}'
)
return MockResponse()
class MockGetResponse:
@@ -99,7 +99,7 @@ def test_pr_title_with_quotes(monkeypatch):
original_run = subprocess.run
def mock_run(*args, **kwargs):
logger.info(f"Running command: {args[0] if args else kwargs.get('args', [])}")
logger.info(f'Running command: {args[0] if args else kwargs.get("args", [])}')
if isinstance(args[0], list) and args[0][0] == 'git':
if 'push' in args[0]:
return subprocess.CompletedProcess(

View File

@@ -506,15 +506,18 @@ async def test_process_issue(
mock_run_controller.return_value = test_case['run_controller_return']
# Patch the necessary functions and methods
with patch(
'openhands.resolver.resolve_issue.create_runtime', mock_create_runtime
), patch(
'openhands.resolver.resolve_issue.run_controller', mock_run_controller
), patch.object(
with (
patch('openhands.resolver.resolve_issue.create_runtime', mock_create_runtime),
patch('openhands.resolver.resolve_issue.run_controller', mock_run_controller),
patch.object(
resolver, 'complete_runtime', return_value={'git_patch': 'test patch'}
), patch.object(resolver, 'initialize_runtime') as mock_initialize_runtime, patch(
),
patch.object(resolver, 'initialize_runtime') as mock_initialize_runtime,
patch(
'openhands.resolver.resolve_issue.SandboxConfig', return_value=MagicMock()
), patch('openhands.resolver.resolve_issue.AppConfig', return_value=MagicMock()):
),
patch('openhands.resolver.resolve_issue.AppConfig', return_value=MagicMock()),
):
# Call the process_issue method
result = await resolver.process_issue(issue, base_commit, handler_instance)

Some files were not shown because too many files have changed in this diff.