feat(agent, CodeAct 2.2): native CodeAct support for Browsing (#4667)

Co-authored-by: tofarr <tofarr@gmail.com>
2026-01-09 14:57:59 -05:00 · 2024-11-04 10:27:27 -06:00
parent f0af90bff3
commit 966da7b7c8
12 changed files with 346 additions and 55 deletions
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -55,18 +56,14 @@ def get_config(
        workspace_base=None,
        workspace_mount_path=None,
    )
-    if metadata.llm_config.log_completions:
+    config.set_llm_config(
-        metadata.llm_config.log_completions_folder = os.path.join(
+        update_llm_config_for_completions_logging(
-            metadata.eval_output_dir, 'llm_completions', instance_id
+            metadata.llm_config, metadata.eval_output_dir, instance_id
        )
-        logger.info(
+    )
            f'Logging LLM completions for instance {instance_id} to '
            f'{metadata.llm_config.log_completions_folder}'
        )
    config.set_llm_config(metadata.llm_config)
    agent_config = AgentConfig(
        codeact_enable_jupyter=True,
-        codeact_enable_browsing_delegate=True,
+        codeact_enable_browsing=True,
        codeact_enable_llm_editor=False,
    )
    config.set_agent_config(agent_config)
--- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py
+++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
@@ -0,0 +1,44 @@
 from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
 from openhands.events.action import AgentFinishAction, MessageAction
 from openhands.events.event import Event
 from openhands.events.observation import AgentDelegateObservation
 from openhands.runtime.base import Runtime
 class Test(BaseIntegrationTest):
    """Integration test that asks the agent to read a GitHub pull request.

    The agent is pointed at a pull-request URL and asked to summarise it. The
    run counts as successful when any message-like event in the history
    mentions one of the license-related keywords checked in `verify_result`.
    """
    INSTRUCTION = 'Look at https://github.com/All-Hands-AI/OpenHands/pull/8, and tell me what is happening there and what did @asadm suggest.'
    @classmethod
    def initialize_runtime(cls, runtime: Runtime) -> None:
        """Prepare the runtime; this test needs no setup, so nothing is done."""
        return None
    @classmethod
    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
        """Scan the event history for an answer containing an expected keyword.

        Args:
            runtime: The runtime the agent ran in (not used by this check).
            histories: All events recorded during the run.

        Returns:
            A successful TestResult as soon as one message, finish action or
            delegate observation contains 'non-commercial', 'MIT' or
            'Apache 2.0'; otherwise a failed TestResult listing the events
            that were inspected.
        """
        expected_keywords = ('non-commercial', 'MIT', 'Apache 2.0')
        inspected_types = (MessageAction, AgentFinishAction, AgentDelegateObservation)
        # Only these event kinds can carry the agent's answer text.
        candidates = [ev for ev in histories if isinstance(ev, inspected_types)]
        for ev in candidates:
            if isinstance(ev, AgentDelegateObservation):
                answer_text = ev.content
            elif isinstance(ev, AgentFinishAction):
                # Finish actions keep their text in the outputs mapping.
                answer_text = ev.outputs.get('content', '')
            elif isinstance(ev, MessageAction):
                answer_text = ev.content
            else:
                raise ValueError(f'Unknown event type: {type(ev)}')
            if any(keyword in answer_text for keyword in expected_keywords):
                return TestResult(success=True)
        failure_reason = f'The answer is not found in any message. Total messages: {len(candidates)}. Messages: {candidates}'
        return TestResult(success=False, reason=failure_reason)
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -10,10 +10,12 @@ import pandas as pd
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -29,7 +31,10 @@ from openhands.events.action import (
    CmdRunAction,
    MessageAction,
 )
-from openhands.events.observation import CmdOutputObservation
+from openhands.events.observation import (
    BrowserOutputObservation,
    CmdOutputObservation,
 )
 from openhands.runtime.base import Runtime
 from openhands.runtime.browser.browser_env import (
    BROWSER_EVAL_GET_GOAL_ACTION,
@@ -37,7 +42,11 @@ from openhands.runtime.browser.browser_env import (
 )
 from openhands.utils.async_utils import call_async_from_sync
-SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
+SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
 # Maps each supported agent class name to the callback that plays the user
 # during evaluation. Every name in SUPPORTED_AGENT_CLS needs an entry because
 # process_instance looks the callback up by subscript.
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    # BrowsingAgent has no fake-user callback; without this entry the lookup
    # raises KeyError when the evaluation is run with BrowsingAgent.
    # NOTE(review): assumes run_controller accepts fake_user_response_fn=None
    # as "no fake user" - confirm against its signature.
    'BrowsingAgent': None,
 }
 def get_config(
@@ -47,25 +56,32 @@ def get_config(
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            base_container_image='xingyaoww/od-eval-miniwob:v1.0',
            enable_auto_lint=True,
            use_host_network=False,
            browsergym_eval_env=env_id,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_remote_runtime_alive=False,
        ),
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
-    config.set_llm_config(metadata.llm_config)
+    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, env_id
        )
    )
    return config
 def initialize_runtime(
    runtime: Runtime,
-) -> str:
+) -> tuple[str, BrowserOutputObservation]:
    """Initialize the runtime for the agent.
    This function is called before the runtime is used to run the agent.
@@ -85,8 +101,14 @@ def initialize_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    goal = obs.content
    # Run noop to get the initial browser observation (e.g., the page URL & content)
    action = BrowseInteractiveAction(browser_actions='noop(1000)')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
-    return goal
+    return goal, obs
 def complete_runtime(
@@ -117,7 +139,7 @@ def process_instance(
    metadata: EvalMetadata,
    reset_logger: bool = True,
 ) -> EvalOutput:
-    env_id = instance.id
+    env_id = instance.instance_id
    config = get_config(metadata, env_id)
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
@@ -129,7 +151,12 @@ def process_instance(
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
-    task_str = initialize_runtime(runtime)
+    task_str, obs = initialize_runtime(runtime)
    task_str += (
        f'\nInitial browser state (output of `noop(1000)`):\n{obs.get_agent_obs_text()}'
    )
    state: State | None = asyncio.run(
        run_controller(
            config=config,
@@ -137,6 +164,9 @@ def process_instance(
                content=task_str
            ),  # take output from initialize_runtime
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
            ],
        )
    )
@@ -159,7 +189,7 @@ def process_instance(
    return_val = complete_runtime(runtime)
    logger.info(f'Return value from complete_runtime: {return_val}')
-    reward = max(return_val['rewards'])
+    reward = max(return_val['rewards'], default=0)
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
--- a/evaluation/scienceagentbench/run_infer.py
+++ b/evaluation/scienceagentbench/run_infer.py
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -76,15 +77,13 @@ def get_config(
        workspace_base=None,
        workspace_mount_path=None,
    )
-    config.set_llm_config(metadata.llm_config)
+    config.set_llm_config(
-    if metadata.llm_config.log_completions:
+        update_llm_config_for_completions_logging(
-        metadata.llm_config.log_completions_folder = os.path.join(
+            metadata.llm_config,
-            metadata.eval_output_dir, 'llm_completions', instance_id
+            metadata.eval_output_dir,
-        )
+            instance_id,
        logger.info(
            f'Logging LLM completions for instance {instance_id} to '
            f'{metadata.llm_config.log_completions_folder}'
        )
    )
    return config
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -40,6 +41,7 @@ from openhands.utils.async_utils import call_async_from_sync
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -88,6 +90,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
            '5. Think about edgecases and make sure your fix handles them as well\n'
            "Your thinking should be thorough and so it's fine if it's very long.\n"
        )
    if RUN_WITH_BROWSING:
        instruction += (
            '<IMPORTANT!>\n'
            'You SHOULD NEVER attempt to browse the web. '
            '</IMPORTANT!>\n'
        )
    return instruction
@@ -142,18 +151,14 @@ def get_config(
        workspace_base=None,
        workspace_mount_path=None,
    )
-    if metadata.llm_config.log_completions:
+    config.set_llm_config(
-        metadata.llm_config.log_completions_folder = os.path.join(
+        update_llm_config_for_completions_logging(
-            metadata.eval_output_dir, 'llm_completions', instance['instance_id']
+            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
-        logger.info(
+    )
            f'Logging LLM completions for instance {instance["instance_id"]} to '
            f'{metadata.llm_config.log_completions_folder}'
        )
    config.set_llm_config(metadata.llm_config)
    agent_config = AgentConfig(
        codeact_enable_jupyter=False,
-        codeact_enable_browsing_delegate=False,
+        codeact_enable_browsing=RUN_WITH_BROWSING,
        codeact_enable_llm_editor=False,
    )
    config.set_agent_config(agent_config)
--- a/evaluation/swe_bench/scripts/run_infer.sh
+++ b/evaluation/swe_bench/scripts/run_infer.sh
@@ -34,6 +34,11 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then
  USE_INSTANCE_IMAGE=true
 fi
 if [ -z "$RUN_WITH_BROWSING" ]; then
  echo "RUN_WITH_BROWSING not specified, use default false"
  RUN_WITH_BROWSING=false
 fi
 if [ -z "$DATASET" ]; then
  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
@@ -47,6 +52,8 @@ fi
 export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
 echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
 get_agent_version
@@ -67,6 +74,10 @@ if [ "$USE_HINT_TEXT" = false ]; then
  EVAL_NOTE="$EVAL_NOTE-no-hint"
 fi
 if [ "$RUN_WITH_BROWSING" = true ]; then
  EVAL_NOTE="$EVAL_NOTE-with-browsing"
 fi
 if [ -n "$EXP_NAME" ]; then
  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
 fi
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -411,3 +411,20 @@ def reset_logger_for_multiprocessing(
    )
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
 def update_llm_config_for_completions_logging(
    llm_config: LLMConfig,
    eval_output_dir: str,
    instance_id: str,
 ) -> LLMConfig:
    """Point completion logging at a per-instance folder.

    When `llm_config.log_completions` is set, `log_completions_folder` is
    overwritten in place with `<eval_output_dir>/llm_completions/<instance_id>`
    and the chosen folder is logged. Otherwise the config is left untouched.

    Args:
        llm_config: The LLM config to update (mutated in place).
        eval_output_dir: Root output directory of the evaluation run.
        instance_id: Identifier of the instance being evaluated.

    Returns:
        The same `llm_config` object that was passed in.
    """
    # Nothing to configure when completion logging is switched off.
    if not llm_config.log_completions:
        return llm_config
    completions_dir = os.path.join(eval_output_dir, 'llm_completions', instance_id)
    llm_config.log_completions_folder = completions_dir
    logger.info(
        f'Logging LLM completions for instance {instance_id} to '
        f'{llm_config.log_completions_folder}'
    )
    return llm_config
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -16,6 +16,7 @@ from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
    BrowseInteractiveAction,
    CmdRunAction,
    FileEditAction,
    IPythonRunCellAction,
@@ -23,6 +24,7 @@ from openhands.events.action import (
 )
 from openhands.events.observation import (
    AgentDelegateObservation,
    BrowserOutputObservation,
    CmdOutputObservation,
    FileEditObservation,
    IPythonRunCellObservation,
@@ -42,7 +44,7 @@ from openhands.utils.prompt import PromptManager
 class CodeActAgent(Agent):
-    VERSION = '2.1'
+    VERSION = '2.2'
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -105,7 +107,7 @@ class CodeActAgent(Agent):
        if self.function_calling_active:
            # Function calling mode
            self.tools = codeact_function_calling.get_tools(
-                codeact_enable_browsing_delegate=self.config.codeact_enable_browsing_delegate,
+                codeact_enable_browsing=self.config.codeact_enable_browsing,
                codeact_enable_jupyter=self.config.codeact_enable_jupyter,
                codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
            )
@@ -142,10 +144,10 @@ class CodeActAgent(Agent):
        Args:
            action (Action): The action to convert. Can be one of:
                - AgentDelegateAction: For delegating tasks to other agents
                - CmdRunAction: For executing bash commands
                - IPythonRunCellAction: For running IPython code
                - FileEditAction: For editing files
                - BrowseInteractiveAction: For browsing the web
                - AgentFinishAction: For ending the interaction
                - MessageAction: For sending messages
            pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
@@ -169,6 +171,7 @@ class CodeActAgent(Agent):
                CmdRunAction,
                IPythonRunCellAction,
                FileEditAction,
                BrowseInteractiveAction,
            ),
        ) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
            if self.function_calling_active:
@@ -192,6 +195,10 @@ class CodeActAgent(Agent):
                )
                return []
            else:
                assert not isinstance(action, BrowseInteractiveAction), (
                    'BrowseInteractiveAction is not supported in non-function calling mode. Action: '
                    + str(action)
                )
                content = [TextContent(text=self.action_parser.action_to_str(action))]
                return [
                    Message(
@@ -266,6 +273,12 @@ class CodeActAgent(Agent):
        elif isinstance(obs, FileEditObservation):
            text = obs_prefix + truncate_content(str(obs), max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, BrowserOutputObservation):
            text = obs.get_agent_obs_text()
            message = Message(
                role='user',
                content=[TextContent(text=obs_prefix + text)],
            )
        elif isinstance(obs, AgentDelegateObservation):
            text = obs_prefix + truncate_content(
                obs.outputs['content'] if 'content' in obs.outputs else '',
@@ -335,6 +348,7 @@ class CodeActAgent(Agent):
        }
        if self.function_calling_active:
            params['tools'] = self.tools
            params['parallel_tool_calls'] = False
        else:
            params['stop'] = [
                '</execute_ipython>',
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -5,6 +5,7 @@ This is similar to the functionality of `CodeActResponseParser`.
 import json
 from browsergym.core.action.highlevel import HighLevelActionSet
 from litellm import (
    ChatCompletionToolParam,
    ChatCompletionToolParamFunctionChunk,
@@ -16,6 +17,7 @@ from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
    BrowseInteractiveAction,
    CmdRunAction,
    FileEditAction,
    IPythonRunCellAction,
@@ -272,24 +274,146 @@ StrReplaceEditorTool = ChatCompletionToolParam(
    ),
 )
-_BROWSER_DELEGATION = """Delegate the task to another browsing agent.
+# from browsergym/core/action/highlevel.py
-The assistant should delegate the task if it needs to browse the Internet.
+_browser_action_space = HighLevelActionSet(
    subsets=['bid', 'nav'],
    strict=False,  # less strict on the parsing of the actions
    multiaction=True,  # enable the agent to take multiple actions at once
 )
 _BROWSER_DESCRIPTION = """Interact with the browser using Python code.
 The following 15 functions are available. Nothing else is supported.
 goto(url: str)
    Description: Navigate to a url.
    Examples:
        goto('http://www.example.com')
 go_back()
    Description: Navigate to the previous page in history.
    Examples:
        go_back()
 go_forward()
    Description: Navigate to the next page in history.
    Examples:
        go_forward()
 noop(wait_ms: float = 1000)
    Description: Do nothing, and optionally wait for the given time (in milliseconds).
    You can use this to get the current page content and/or wait for the page to load.
    Examples:
        noop()
        noop(500)
 scroll(delta_x: float, delta_y: float)
    Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
    Examples:
        scroll(0, 200)
        scroll(-50.2, -100.5)
 fill(bid: str, value: str)
    Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
    Examples:
        fill('237', 'example value')
        fill('45', 'multi-line\nexample')
        fill('a12', 'example with "quotes"')
 select_option(bid: str, options: str | list[str])
    Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
    Examples:
        select_option('a48', 'blue')
        select_option('c48', ['red', 'green', 'blue'])
 click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
    Description: Click an element.
    Examples:
        click('a51')
        click('b22', button='right')
        click('48', button='middle', modifiers=['Shift'])
 dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
    Description: Double click an element.
    Examples:
        dblclick('12')
        dblclick('ca42', button='right')
        dblclick('178', button='middle', modifiers=['Shift'])
 hover(bid: str)
    Description: Hover over an element.
    Examples:
        hover('b8')
 press(bid: str, key_comb: str)
    Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
    Examples:
        press('88', 'Backspace')
        press('a26', 'ControlOrMeta+a')
        press('a61', 'Meta+Shift+t')
 focus(bid: str)
    Description: Focus the matching element.
    Examples:
        focus('b455')
 clear(bid: str)
    Description: Clear the input field.
    Examples:
        clear('996')
 drag_and_drop(from_bid: str, to_bid: str)
    Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
    Examples:
        drag_and_drop('56', '498')
 upload_file(bid: str, file: str | list[str])
    Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
    Examples:
        upload_file('572', '/home/user/my_receipt.pdf')
        upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
 Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
 More than 2-3 actions usually leads to failure or unexpected behavior. Example:
 fill('a12', 'example with "quotes"')
 click('a51')
 click('48', button='middle', modifiers=['Shift'])
 """
-BrowserDelegationTool = ChatCompletionToolParam(
+for _, action in _browser_action_space.action_set.items():
    assert (
        action.signature in _BROWSER_DESCRIPTION
    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
    assert (
        action.description in _BROWSER_DESCRIPTION
    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
 BrowserTool = ChatCompletionToolParam(
    type='function',
    function=ChatCompletionToolParamFunctionChunk(
-        name='delegate_to_browsing_agent',
+        name='browser',
-        description=_BROWSER_DELEGATION,
+        description=_BROWSER_DESCRIPTION,
        parameters={
            'type': 'object',
            'properties': {
-                'task': {
+                'code': {
                    'type': 'string',
-                    'description': 'The task for the browsing agent to execute. It should include all the necessary context and specify what information the browsing agent should return.',
+                    'description': 'The Python code that interacts with the browser.',
-                },
+                }
            },
-            'required': ['task'],
+            'required': ['code'],
        },
    ),
 )
@@ -357,6 +481,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                    f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
                )
                action = IPythonRunCellAction(code=code, include_extra=False)
            elif tool_call.function.name == 'browser':
                action = BrowseInteractiveAction(browser_actions=arguments['code'])
            else:
                raise RuntimeError(f'Unknown tool call: {tool_call.function.name}')
@@ -381,13 +507,13 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
 def get_tools(
-    codeact_enable_browsing_delegate: bool = False,
+    codeact_enable_browsing: bool = False,
    codeact_enable_llm_editor: bool = False,
    codeact_enable_jupyter: bool = False,
 ) -> list[ChatCompletionToolParam]:
    tools = [CmdRunTool, FinishTool]
-    if codeact_enable_browsing_delegate:
+    if codeact_enable_browsing:
-        tools.append(BrowserDelegationTool)
+        tools.append(BrowserTool)
    if codeact_enable_jupyter:
        tools.append(IPythonTool)
    if codeact_enable_llm_editor:
--- a/openhands/core/config/agent_config.py
+++ b/openhands/core/config/agent_config.py
@@ -9,7 +9,7 @@ class AgentConfig:
    Attributes:
        function_calling: Whether function calling is enabled. Default is True.
-        codeact_enable_browsing_delegate: Whether browsing delegate is enabled in the action space. Default is False. Only works with function calling.
+        codeact_enable_browsing: Whether browsing is enabled in the action space. Default is True. Only works with function calling.
        codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling.
        codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is True.
        micro_agent_name: The name of the micro agent to use for this agent.
@@ -19,7 +19,7 @@ class AgentConfig:
    """
    function_calling: bool = True
-    codeact_enable_browsing_delegate: bool = True
+    codeact_enable_browsing: bool = True
    codeact_enable_llm_editor: bool = False
    codeact_enable_jupyter: bool = True
    micro_agent_name: str | None = None
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@@ -1,5 +1,7 @@
 from dataclasses import dataclass, field
 from browsergym.utils.obs import flatten_axtree_to_str
 from openhands.core.schema import ObservationType
 from openhands.events.observation.observation import Observation
@@ -29,7 +31,7 @@ class BrowserOutputObservation(Observation):
        return 'Visited ' + self.url
    def __str__(self) -> str:
-        return (
+        ret = (
            '**BrowserOutputObservation**\n'
            f'URL: {self.url}\n'
            f'Error: {self.error}\n'
@@ -38,5 +40,47 @@ class BrowserOutputObservation(Observation):
            f'Last browser action: {self.last_browser_action}\n'
            f'Last browser action error: {self.last_browser_action_error}\n'
            f'Focused element bid: {self.focused_element_bid}\n'
-            f'CONTENT: {self.content}\n'
+            f'Content: {self.content}\n'
        )
        ret += '--- Agent Observation ---\n'
        ret += self.get_agent_obs_text()
        return ret
    def get_agent_obs_text(self) -> str:
        """Build the concise text form of this observation shown to the agent.

        The text starts with the current URL and the focused element bid,
        followed by either the last browser action error (when `self.error`
        is truthy) or a success marker, and ends with the accessibility tree.
        If rendering the tree raises, a bracketed error note is appended in
        place of the tree.

        Returns:
            The assembled observation text.
        """
        sections = [
            f'[Current URL: {self.url}]\n',
            f'[Focused element bid: {self.focused_element_bid}]\n\n',
        ]
        if not self.error:
            sections.append('[Action executed successfully.]\n')
        else:
            sections.append(
                '================ BEGIN error message ===============\n'
                'The following error occurred when executing the last action:\n'
                f'{self.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        try:
            # The tree is deliberately not restricted to visible elements so
            # the agent sees the full content of the web page.
            # FIXME: handle the case when the web page is too large
            axtree_text = self.get_axtree_str(filter_visible_only=False)
            sections.append(
                '============== BEGIN accessibility tree ==============\n'
                f'{axtree_text}\n'
                '============== END accessibility tree ==============\n'
            )
        except Exception as exc:
            # Best effort: report the failure to the agent instead of raising.
            sections.append(
                f'\n[Error encountered when processing the accessibility tree: {exc}]'
            )
        return ''.join(sections)
    def get_axtree_str(self, filter_visible_only: bool = False) -> str:
        """Render the accessibility tree of this observation as text.

        The rendered text is also stored on `self._axtree_str`.

        Args:
            filter_visible_only: Forwarded unchanged to `flatten_axtree_to_str`
                (presumably restricts the output to visible elements; confirm
                against BrowserGym).

        Returns:
            The flattened accessibility tree string.
        """
        tree = self.axtree_object
        render_options = {
            'extra_properties': self.extra_element_properties,
            'with_clickable': True,
            'skip_generic': False,
            'filter_visible_only': filter_visible_only,
        }
        rendered = flatten_axtree_to_str(tree, **render_options)
        # Keep the latest rendering on the instance.
        self._axtree_str = rendered
        return rendered
--- a/openhands/runtime/browser/browser_env.py
+++ b/openhands/runtime/browser/browser_env.py
@@ -81,7 +81,10 @@ class BrowserEnv:
                raise ValueError(
                    f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
                )
-            env = gym.make(self.browsergym_eval_env)
+            env = gym.make(
                self.browsergym_eval_env,
                tags_to_mark='all',
            )
        else:
            env = gym.make(
                'browsergym/openended',
@@ -89,6 +92,7 @@ class BrowserEnv:
                wait_for_user_message=False,
                headless=True,
                disable_env_checker=True,
                tags_to_mark='all',
            )
        obs, info = env.reset()