feat(agent, CodeAct 2.2): native CodeAct support for Browsing (#4667)

Co-authored-by: tofarr <tofarr@gmail.com>
2026-01-09 23:08:04 -05:00 · 2024-11-04 10:27:27 -06:00
parent f0af90bff3
commit 966da7b7c8
12 changed files with 346 additions and 55 deletions
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -55,18 +56,14 @@ def get_config(
        workspace_base=None,
        workspace_mount_path=None,
    )
-    if metadata.llm_config.log_completions:
-        metadata.llm_config.log_completions_folder = os.path.join(
-            metadata.eval_output_dir, 'llm_completions', instance_id
+    config.set_llm_config(
+        update_llm_config_for_completions_logging(
+            metadata.llm_config, metadata.eval_output_dir, instance_id
        )
-        logger.info(
-            f'Logging LLM completions for instance {instance_id} to '
-            f'{metadata.llm_config.log_completions_folder}'
-        )
-    config.set_llm_config(metadata.llm_config)
+    )
    agent_config = AgentConfig(
        codeact_enable_jupyter=True,
-        codeact_enable_browsing_delegate=True,
+        codeact_enable_browsing=True,
        codeact_enable_llm_editor=False,
    )
    config.set_agent_config(agent_config)
--- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py
+++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
@@ -0,0 +1,44 @@
+from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
+from openhands.events.action import AgentFinishAction, MessageAction
+from openhands.events.event import Event
+from openhands.events.observation import AgentDelegateObservation
+from openhands.runtime.base import Runtime
+
+
+class Test(BaseIntegrationTest):
+    INSTRUCTION = 'Look at https://github.com/All-Hands-AI/OpenHands/pull/8, and tell me what is happening there and what did @asadm suggest.'
+
+    @classmethod
+    def initialize_runtime(cls, runtime: Runtime) -> None:
+        pass
+
+    @classmethod
+    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        # check whether any message contains one of the expected keywords ('non-commercial', 'MIT', 'Apache 2.0')
+        message_actions = [
+            event
+            for event in histories
+            if isinstance(
+                event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
+            )
+        ]
+        for event in message_actions:
+            if isinstance(event, AgentDelegateObservation):
+                content = event.content
+            elif isinstance(event, AgentFinishAction):
+                content = event.outputs.get('content', '')
+            elif isinstance(event, MessageAction):
+                content = event.content
+            else:
+                raise ValueError(f'Unknown event type: {type(event)}')
+
+            if (
+                'non-commercial' in content
+                or 'MIT' in content
+                or 'Apache 2.0' in content
+            ):
+                return TestResult(success=True)
+        return TestResult(
+            success=False,
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+        )
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -10,10 +10,12 @@ import pandas as pd
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    codeact_user_response,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -29,7 +31,10 @@ from openhands.events.action import (
    CmdRunAction,
    MessageAction,
 )
-from openhands.events.observation import CmdOutputObservation
+from openhands.events.observation import (
+    BrowserOutputObservation,
+    CmdOutputObservation,
+)
 from openhands.runtime.base import Runtime
 from openhands.runtime.browser.browser_env import (
    BROWSER_EVAL_GET_GOAL_ACTION,
@@ -37,7 +42,11 @@ from openhands.runtime.browser.browser_env import (
 )
 from openhands.utils.async_utils import call_async_from_sync

-SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
+SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+}


 def get_config(
@@ -47,25 +56,32 @@ def get_config(
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
-        runtime='eventstream',
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            base_container_image='xingyaoww/od-eval-miniwob:v1.0',
            enable_auto_lint=True,
            use_host_network=False,
            browsergym_eval_env=env_id,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_remote_runtime_alive=False,
        ),
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
-    config.set_llm_config(metadata.llm_config)
+    config.set_llm_config(
+        update_llm_config_for_completions_logging(
+            metadata.llm_config, metadata.eval_output_dir, env_id
+        )
+    )
    return config


 def initialize_runtime(
    runtime: Runtime,
-) -> str:
+) -> tuple[str, BrowserOutputObservation]:
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
@@ -85,8 +101,14 @@ def initialize_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    goal = obs.content

+    # Run noop to get the initial browser observation (e.g., the page URL & content)
+    action = BrowseInteractiveAction(browser_actions='noop(1000)')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
-    return goal
+    return goal, obs


 def complete_runtime(
@@ -117,7 +139,7 @@ def process_instance(
    metadata: EvalMetadata,
    reset_logger: bool = True,
 ) -> EvalOutput:
-    env_id = instance.id
+    env_id = instance.instance_id
    config = get_config(metadata, env_id)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
@@ -129,7 +151,12 @@ def process_instance(

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
-    task_str = initialize_runtime(runtime)
+    task_str, obs = initialize_runtime(runtime)
+
+    task_str += (
+        f'\nInitial browser state (output of `noop(1000)`):\n{obs.get_agent_obs_text()}'
+    )
+
    state: State | None = asyncio.run(
        run_controller(
            config=config,
@@ -137,6 +164,9 @@ def process_instance(
                content=task_str
            ),  # take output from initialize_runtime
            runtime=runtime,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                metadata.agent_class
+            ],
        )
    )

@@ -159,7 +189,7 @@ def process_instance(

    return_val = complete_runtime(runtime)
    logger.info(f'Return value from complete_runtime: {return_val}')
-    reward = max(return_val['rewards'])
+    reward = max(return_val['rewards'], default=0)

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
--- a/evaluation/scienceagentbench/run_infer.py
+++ b/evaluation/scienceagentbench/run_infer.py
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -76,15 +77,13 @@ def get_config(
        workspace_base=None,
        workspace_mount_path=None,
    )
-    config.set_llm_config(metadata.llm_config)
-    if metadata.llm_config.log_completions:
-        metadata.llm_config.log_completions_folder = os.path.join(
-            metadata.eval_output_dir, 'llm_completions', instance_id
-        )
-        logger.info(
-            f'Logging LLM completions for instance {instance_id} to '
-            f'{metadata.llm_config.log_completions_folder}'
+    config.set_llm_config(
+        update_llm_config_for_completions_logging(
+            metadata.llm_config,
+            metadata.eval_output_dir,
+            instance_id,
        )
+    )
    return config


--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
+    update_llm_config_for_completions_logging,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
@@ -40,6 +41,7 @@ from openhands.utils.async_utils import call_async_from_sync

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
+RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'

 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
@@ -88,6 +90,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
            '5. Think about edgecases and make sure your fix handles them as well\n'
            "Your thinking should be thorough and so it's fine if it's very long.\n"
        )
+
+    if RUN_WITH_BROWSING:
+        instruction += (
+            '<IMPORTANT!>\n'
+            'You SHOULD NEVER attempt to browse the web. '
+            '</IMPORTANT!>\n'
+        )
    return instruction


@@ -142,18 +151,14 @@ def get_config(
        workspace_base=None,
        workspace_mount_path=None,
    )
-    if metadata.llm_config.log_completions:
-        metadata.llm_config.log_completions_folder = os.path.join(
-            metadata.eval_output_dir, 'llm_completions', instance['instance_id']
+    config.set_llm_config(
+        update_llm_config_for_completions_logging(
+            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
-        logger.info(
-            f'Logging LLM completions for instance {instance["instance_id"]} to '
-            f'{metadata.llm_config.log_completions_folder}'
-        )
-    config.set_llm_config(metadata.llm_config)
+    )
    agent_config = AgentConfig(
        codeact_enable_jupyter=False,
-        codeact_enable_browsing_delegate=False,
+        codeact_enable_browsing=RUN_WITH_BROWSING,
        codeact_enable_llm_editor=False,
    )
    config.set_agent_config(agent_config)
--- a/evaluation/swe_bench/scripts/run_infer.sh
+++ b/evaluation/swe_bench/scripts/run_infer.sh
@@ -34,6 +34,11 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then
  USE_INSTANCE_IMAGE=true
 fi

+if [ -z "$RUN_WITH_BROWSING" ]; then
+  echo "RUN_WITH_BROWSING not specified, use default false"
+  RUN_WITH_BROWSING=false
+fi
+

 if [ -z "$DATASET" ]; then
  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
@@ -47,6 +52,8 @@ fi

 export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
 echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
+export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
+echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

 get_agent_version

@@ -67,6 +74,10 @@ if [ "$USE_HINT_TEXT" = false ]; then
  EVAL_NOTE="$EVAL_NOTE-no-hint"
 fi

+if [ "$RUN_WITH_BROWSING" = true ]; then
+  EVAL_NOTE="$EVAL_NOTE-with-browsing"
+fi
+
 if [ -n "$EXP_NAME" ]; then
  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
 fi
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -411,3 +411,20 @@ def reset_logger_for_multiprocessing(
    )
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
+
+
+def update_llm_config_for_completions_logging(
+    llm_config: LLMConfig,
+    eval_output_dir: str,
+    instance_id: str,
+) -> LLMConfig:
+    """Update the LLM config for logging completions."""
+    if llm_config.log_completions:
+        llm_config.log_completions_folder = os.path.join(
+            eval_output_dir, 'llm_completions', instance_id
+        )
+        logger.info(
+            f'Logging LLM completions for instance {instance_id} to '
+            f'{llm_config.log_completions_folder}'
+        )
+    return llm_config
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -16,6 +16,7 @@ from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
+    BrowseInteractiveAction,
    CmdRunAction,
    FileEditAction,
    IPythonRunCellAction,
@@ -23,6 +24,7 @@ from openhands.events.action import (
 )
 from openhands.events.observation import (
    AgentDelegateObservation,
+    BrowserOutputObservation,
    CmdOutputObservation,
    FileEditObservation,
    IPythonRunCellObservation,
@@ -42,7 +44,7 @@ from openhands.utils.prompt import PromptManager


 class CodeActAgent(Agent):
-    VERSION = '2.1'
+    VERSION = '2.2'
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -105,7 +107,7 @@ class CodeActAgent(Agent):
        if self.function_calling_active:
            # Function calling mode
            self.tools = codeact_function_calling.get_tools(
-                codeact_enable_browsing_delegate=self.config.codeact_enable_browsing_delegate,
+                codeact_enable_browsing=self.config.codeact_enable_browsing,
                codeact_enable_jupyter=self.config.codeact_enable_jupyter,
                codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
            )
@@ -142,10 +144,10 @@ class CodeActAgent(Agent):

        Args:
            action (Action): The action to convert. Can be one of:
-                - AgentDelegateAction: For delegating tasks to other agents
                - CmdRunAction: For executing bash commands
                - IPythonRunCellAction: For running IPython code
                - FileEditAction: For editing files
+                - BrowseInteractiveAction: For browsing the web
                - AgentFinishAction: For ending the interaction
                - MessageAction: For sending messages
            pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
@@ -169,6 +171,7 @@ class CodeActAgent(Agent):
                CmdRunAction,
                IPythonRunCellAction,
                FileEditAction,
+                BrowseInteractiveAction,
            ),
        ) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
            if self.function_calling_active:
@@ -192,6 +195,10 @@ class CodeActAgent(Agent):
                )
                return []
            else:
+                assert not isinstance(action, BrowseInteractiveAction), (
+                    'BrowseInteractiveAction is not supported in non-function calling mode. Action: '
+                    + str(action)
+                )
                content = [TextContent(text=self.action_parser.action_to_str(action))]
                return [
                    Message(
@@ -266,6 +273,12 @@ class CodeActAgent(Agent):
        elif isinstance(obs, FileEditObservation):
            text = obs_prefix + truncate_content(str(obs), max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, BrowserOutputObservation):
+            text = obs.get_agent_obs_text()
+            message = Message(
+                role='user',
+                content=[TextContent(text=obs_prefix + text)],
+            )
        elif isinstance(obs, AgentDelegateObservation):
            text = obs_prefix + truncate_content(
                obs.outputs['content'] if 'content' in obs.outputs else '',
@@ -335,6 +348,7 @@ class CodeActAgent(Agent):
        }
        if self.function_calling_active:
            params['tools'] = self.tools
+            params['parallel_tool_calls'] = False
        else:
            params['stop'] = [
                '</execute_ipython>',
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -5,6 +5,7 @@ This is similar to the functionality of `CodeActResponseParser`.

 import json

+from browsergym.core.action.highlevel import HighLevelActionSet
 from litellm import (
    ChatCompletionToolParam,
    ChatCompletionToolParamFunctionChunk,
@@ -16,6 +17,7 @@ from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
+    BrowseInteractiveAction,
    CmdRunAction,
    FileEditAction,
    IPythonRunCellAction,
@@ -272,24 +274,146 @@ StrReplaceEditorTool = ChatCompletionToolParam(
    ),
 )

-_BROWSER_DELEGATION = """Delegate the task to another browsing agent.
-The assistant should delegate the task if it needs to browse the Internet.
+# from browsergym/core/action/highlevel.py
+_browser_action_space = HighLevelActionSet(
+    subsets=['bid', 'nav'],
+    strict=False,  # less strict on the parsing of the actions
+    multiaction=True,  # enable the agent to take multiple actions at once
+)
+
+
+_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
+The following 15 functions are available. Nothing else is supported.
+
+goto(url: str)
+    Description: Navigate to a url.
+    Examples:
+        goto('http://www.example.com')
+
+go_back()
+    Description: Navigate to the previous page in history.
+    Examples:
+        go_back()
+
+go_forward()
+    Description: Navigate to the next page in history.
+    Examples:
+        go_forward()
+
+noop(wait_ms: float = 1000)
+    Description: Do nothing, and optionally wait for the given time (in milliseconds).
+    You can use this to get the current page content and/or wait for the page to load.
+    Examples:
+        noop()
+
+        noop(500)
+
+scroll(delta_x: float, delta_y: float)
+    Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
+    Examples:
+        scroll(0, 200)
+
+        scroll(-50.2, -100.5)
+
+fill(bid: str, value: str)
+    Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
+    Examples:
+        fill('237', 'example value')
+
+        fill('45', 'multi-line\nexample')
+
+        fill('a12', 'example with "quotes"')
+
+select_option(bid: str, options: str | list[str])
+    Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
+    Examples:
+        select_option('a48', 'blue')
+
+        select_option('c48', ['red', 'green', 'blue'])
+
+click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
+    Description: Click an element.
+    Examples:
+        click('a51')
+
+        click('b22', button='right')
+
+        click('48', button='middle', modifiers=['Shift'])
+
+dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
+    Description: Double click an element.
+    Examples:
+        dblclick('12')
+
+        dblclick('ca42', button='right')
+
+        dblclick('178', button='middle', modifiers=['Shift'])
+
+hover(bid: str)
+    Description: Hover over an element.
+    Examples:
+        hover('b8')
+
+press(bid: str, key_comb: str)
+    Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
+    Examples:
+        press('88', 'Backspace')
+
+        press('a26', 'ControlOrMeta+a')
+
+        press('a61', 'Meta+Shift+t')
+
+focus(bid: str)
+    Description: Focus the matching element.
+    Examples:
+        focus('b455')
+
+clear(bid: str)
+    Description: Clear the input field.
+    Examples:
+        clear('996')
+
+drag_and_drop(from_bid: str, to_bid: str)
+    Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
+    Examples:
+        drag_and_drop('56', '498')
+
+upload_file(bid: str, file: str | list[str])
+    Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
+    Examples:
+        upload_file('572', '/home/user/my_receipt.pdf')
+
+        upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
+
+Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
+More than 2-3 actions usually leads to failure or unexpected behavior. Example:
+fill('a12', 'example with "quotes"')
+click('a51')
+click('48', button='middle', modifiers=['Shift'])
 """

-BrowserDelegationTool = ChatCompletionToolParam(
+for _, action in _browser_action_space.action_set.items():
+    assert (
+        action.signature in _BROWSER_DESCRIPTION
+    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
+    assert (
+        action.description in _BROWSER_DESCRIPTION
+    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
+
+BrowserTool = ChatCompletionToolParam(
    type='function',
    function=ChatCompletionToolParamFunctionChunk(
-        name='delegate_to_browsing_agent',
-        description=_BROWSER_DELEGATION,
+        name='browser',
+        description=_BROWSER_DESCRIPTION,
        parameters={
            'type': 'object',
            'properties': {
-                'task': {
+                'code': {
                    'type': 'string',
-                    'description': 'The task for the browsing agent to execute. It should include all the necessary context and specify what information the browsing agent should return.',
-                },
+                    'description': 'The Python code that interacts with the browser.',
+                }
            },
-            'required': ['task'],
+            'required': ['code'],
        },
    ),
 )
@@ -357,6 +481,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                    f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
                )
                action = IPythonRunCellAction(code=code, include_extra=False)
+            elif tool_call.function.name == 'browser':
+                action = BrowseInteractiveAction(browser_actions=arguments['code'])
            else:
                raise RuntimeError(f'Unknown tool call: {tool_call.function.name}')

@@ -381,13 +507,13 @@ def response_to_actions(response: ModelResponse) -> list[Action]:


 def get_tools(
-    codeact_enable_browsing_delegate: bool = False,
+    codeact_enable_browsing: bool = False,
    codeact_enable_llm_editor: bool = False,
    codeact_enable_jupyter: bool = False,
 ) -> list[ChatCompletionToolParam]:
    tools = [CmdRunTool, FinishTool]
-    if codeact_enable_browsing_delegate:
-        tools.append(BrowserDelegationTool)
+    if codeact_enable_browsing:
+        tools.append(BrowserTool)
    if codeact_enable_jupyter:
        tools.append(IPythonTool)
    if codeact_enable_llm_editor:
--- a/openhands/core/config/agent_config.py
+++ b/openhands/core/config/agent_config.py
@@ -9,7 +9,7 @@ class AgentConfig:

    Attributes:
        function_calling: Whether function calling is enabled. Default is True.
-        codeact_enable_browsing_delegate: Whether browsing delegate is enabled in the action space. Default is False. Only works with function calling.
+        codeact_enable_browsing: Whether browsing is enabled in the action space. Default is True. Only works with function calling.
        codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling.
        codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is False.
        micro_agent_name: The name of the micro agent to use for this agent.
@@ -19,7 +19,7 @@ class AgentConfig:
    """

    function_calling: bool = True
-    codeact_enable_browsing_delegate: bool = True
+    codeact_enable_browsing: bool = True
    codeact_enable_llm_editor: bool = False
    codeact_enable_jupyter: bool = True
    micro_agent_name: str | None = None
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@@ -1,5 +1,7 @@
 from dataclasses import dataclass, field

+from browsergym.utils.obs import flatten_axtree_to_str
+
 from openhands.core.schema import ObservationType
 from openhands.events.observation.observation import Observation

@@ -29,7 +31,7 @@ class BrowserOutputObservation(Observation):
        return 'Visited ' + self.url

    def __str__(self) -> str:
-        return (
+        ret = (
            '**BrowserOutputObservation**\n'
            f'URL: {self.url}\n'
            f'Error: {self.error}\n'
@@ -38,5 +40,47 @@ class BrowserOutputObservation(Observation):
            f'Last browser action: {self.last_browser_action}\n'
            f'Last browser action error: {self.last_browser_action_error}\n'
            f'Focused element bid: {self.focused_element_bid}\n'
-            f'CONTENT: {self.content}\n'
+            f'Content: {self.content}\n'
        )
+        ret += '--- Agent Observation ---\n'
+        ret += self.get_agent_obs_text()
+        return ret
+
+    def get_agent_obs_text(self) -> str:
+        """Get a concise text that will be shown to the agent."""
+        text = f'[Current URL: {self.url}]\n'
+        text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
+        if self.error:
+            text += (
+                '================ BEGIN error message ===============\n'
+                'The following error occurred when executing the last action:\n'
+                f'{self.last_browser_action_error}\n'
+                '================ END error message ===============\n'
+            )
+        else:
+            text += '[Action executed successfully.]\n'
+
+        try:
+            # We do not filter visible only here because we want to show the full content
+            # of the web page to the agent for simplicity.
+            # FIXME: handle the case when the web page is too large
+            cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
+            text += (
+                f'============== BEGIN accessibility tree ==============\n'
+                f'{cur_axtree_txt}\n'
+                f'============== END accessibility tree ==============\n'
+            )
+        except Exception as e:
+            text += f'\n[Error encountered when processing the accessibility tree: {e}]'
+        return text
+
+    def get_axtree_str(self, filter_visible_only: bool = False) -> str:
+        cur_axtree_txt = flatten_axtree_to_str(
+            self.axtree_object,
+            extra_properties=self.extra_element_properties,
+            with_clickable=True,
+            skip_generic=False,
+            filter_visible_only=filter_visible_only,
+        )
+        self._axtree_str = cur_axtree_txt
+        return cur_axtree_txt
--- a/openhands/runtime/browser/browser_env.py
+++ b/openhands/runtime/browser/browser_env.py
@@ -81,7 +81,10 @@ class BrowserEnv:
                raise ValueError(
                    f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
                )
-            env = gym.make(self.browsergym_eval_env)
+            env = gym.make(
+                self.browsergym_eval_env,
+                tags_to_mark='all',
+            )
        else:
            env = gym.make(
                'browsergym/openended',
@@ -89,6 +92,7 @@ class BrowserEnv:
                wait_for_user_message=False,
                headless=True,
                disable_env_checker=True,
+                tags_to_mark='all',
            )

        obs, info = env.reset()