feat(agent, CodeAct 2.2): native CodeAct support for Browsing (#4667)

Co-authored-by: tofarr <tofarr@gmail.com>
This commit is contained in:
Xingyao Wang
2024-11-04 10:27:27 -06:00
committed by GitHub
parent f0af90bff3
commit 966da7b7c8
12 changed files with 346 additions and 55 deletions

View File

@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
prepare_dataset, prepare_dataset,
reset_logger_for_multiprocessing, reset_logger_for_multiprocessing,
run_evaluation, run_evaluation,
update_llm_config_for_completions_logging,
) )
from openhands.controller.state.state import State from openhands.controller.state.state import State
from openhands.core.config import ( from openhands.core.config import (
@@ -55,18 +56,14 @@ def get_config(
workspace_base=None, workspace_base=None,
workspace_mount_path=None, workspace_mount_path=None,
) )
if metadata.llm_config.log_completions: config.set_llm_config(
metadata.llm_config.log_completions_folder = os.path.join( update_llm_config_for_completions_logging(
metadata.eval_output_dir, 'llm_completions', instance_id metadata.llm_config, metadata.eval_output_dir, instance_id
) )
logger.info( )
f'Logging LLM completions for instance {instance_id} to '
f'{metadata.llm_config.log_completions_folder}'
)
config.set_llm_config(metadata.llm_config)
agent_config = AgentConfig( agent_config = AgentConfig(
codeact_enable_jupyter=True, codeact_enable_jupyter=True,
codeact_enable_browsing_delegate=True, codeact_enable_browsing=True,
codeact_enable_llm_editor=False, codeact_enable_llm_editor=False,
) )
config.set_agent_config(agent_config) config.set_agent_config(agent_config)

View File

@@ -0,0 +1,44 @@
from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
from openhands.events.action import AgentFinishAction, MessageAction
from openhands.events.event import Event
from openhands.events.observation import AgentDelegateObservation
from openhands.runtime.base import Runtime
class Test(BaseIntegrationTest):
    """Browsing integration test built around a GitHub pull request.

    The agent is asked to look at a pull request and summarise it; the run
    passes when any message-like event contains one of a few expected
    keywords (see ``verify_result``).
    """

    INSTRUCTION = 'Look at https://github.com/All-Hands-AI/OpenHands/pull/8, and tell me what is happening there and what did @asadm suggest.'

    @classmethod
    def initialize_runtime(cls, runtime: Runtime) -> None:
        """No runtime preparation is required for this test."""
        return None

    @classmethod
    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
        """Scan the event history for an answer containing an expected keyword.

        Args:
            runtime: Unused by this check.
            histories: Events recorded during the agent run.

        Returns:
            A successful ``TestResult`` as soon as one considered event's text
            contains an expected keyword; otherwise a failed ``TestResult``
            whose reason lists every event that was considered.

        Raises:
            ValueError: If a considered event is of none of the handled types.
        """
        # Case-sensitive substrings; a single hit is enough to pass.
        # NOTE(review): presumably these are the terms discussed in the linked
        # pull request -- confirm against the PR content.
        expected_keywords = ('non-commercial', 'MIT', 'Apache 2.0')

        # Only these event kinds can carry text produced by the agent.
        answer_types = (MessageAction, AgentFinishAction, AgentDelegateObservation)
        candidates = [evt for evt in histories if isinstance(evt, answer_types)]

        for evt in candidates:
            # Each event kind stores its text in a different place.
            if isinstance(evt, AgentDelegateObservation):
                text = evt.content
            elif isinstance(evt, AgentFinishAction):
                text = evt.outputs.get('content', '')
            elif isinstance(evt, MessageAction):
                text = evt.content
            else:
                raise ValueError(f'Unknown event type: {type(evt)}')

            if any(keyword in text for keyword in expected_keywords):
                return TestResult(success=True)

        return TestResult(
            success=False,
            reason=f'The answer is not found in any message. Total messages: {len(candidates)}. Messages: {candidates}',
        )

View File

@@ -10,10 +10,12 @@ import pandas as pd
from evaluation.utils.shared import ( from evaluation.utils.shared import (
EvalMetadata, EvalMetadata,
EvalOutput, EvalOutput,
codeact_user_response,
make_metadata, make_metadata,
prepare_dataset, prepare_dataset,
reset_logger_for_multiprocessing, reset_logger_for_multiprocessing,
run_evaluation, run_evaluation,
update_llm_config_for_completions_logging,
) )
from openhands.controller.state.state import State from openhands.controller.state.state import State
from openhands.core.config import ( from openhands.core.config import (
@@ -29,7 +31,10 @@ from openhands.events.action import (
CmdRunAction, CmdRunAction,
MessageAction, MessageAction,
) )
from openhands.events.observation import CmdOutputObservation from openhands.events.observation import (
BrowserOutputObservation,
CmdOutputObservation,
)
from openhands.runtime.base import Runtime from openhands.runtime.base import Runtime
from openhands.runtime.browser.browser_env import ( from openhands.runtime.browser.browser_env import (
BROWSER_EVAL_GET_GOAL_ACTION, BROWSER_EVAL_GET_GOAL_ACTION,
@@ -37,7 +42,11 @@ from openhands.runtime.browser.browser_env import (
) )
from openhands.utils.async_utils import call_async_from_sync from openhands.utils.async_utils import call_async_from_sync
SUPPORTED_AGENT_CLS = {'BrowsingAgent'} SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
}
def get_config( def get_config(
@@ -47,25 +56,32 @@ def get_config(
config = AppConfig( config = AppConfig(
default_agent=metadata.agent_class, default_agent=metadata.agent_class,
run_as_openhands=False, run_as_openhands=False,
runtime='eventstream', runtime=os.environ.get('RUNTIME', 'eventstream'),
max_iterations=metadata.max_iterations, max_iterations=metadata.max_iterations,
sandbox=SandboxConfig( sandbox=SandboxConfig(
base_container_image='xingyaoww/od-eval-miniwob:v1.0', base_container_image='xingyaoww/od-eval-miniwob:v1.0',
enable_auto_lint=True, enable_auto_lint=True,
use_host_network=False, use_host_network=False,
browsergym_eval_env=env_id, browsergym_eval_env=env_id,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_remote_runtime_alive=False,
), ),
# do not mount workspace # do not mount workspace
workspace_base=None, workspace_base=None,
workspace_mount_path=None, workspace_mount_path=None,
) )
config.set_llm_config(metadata.llm_config) config.set_llm_config(
update_llm_config_for_completions_logging(
metadata.llm_config, metadata.eval_output_dir, env_id
)
)
return config return config
def initialize_runtime( def initialize_runtime(
runtime: Runtime, runtime: Runtime,
) -> str: ) -> tuple[str, BrowserOutputObservation]:
"""Initialize the runtime for the agent. """Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent. This function is called before the runtime is used to run the agent.
@@ -85,8 +101,14 @@ def initialize_runtime(
logger.info(obs, extra={'msg_type': 'OBSERVATION'}) logger.info(obs, extra={'msg_type': 'OBSERVATION'})
goal = obs.content goal = obs.content
# Run noop to get the initial browser observation (e.g., the page URL & content)
action = BrowseInteractiveAction(browser_actions='noop(1000)')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
return goal return goal, obs
def complete_runtime( def complete_runtime(
@@ -117,7 +139,7 @@ def process_instance(
metadata: EvalMetadata, metadata: EvalMetadata,
reset_logger: bool = True, reset_logger: bool = True,
) -> EvalOutput: ) -> EvalOutput:
env_id = instance.id env_id = instance.instance_id
config = get_config(metadata, env_id) config = get_config(metadata, env_id)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
@@ -129,7 +151,12 @@ def process_instance(
runtime = create_runtime(config) runtime = create_runtime(config)
call_async_from_sync(runtime.connect) call_async_from_sync(runtime.connect)
task_str = initialize_runtime(runtime) task_str, obs = initialize_runtime(runtime)
task_str += (
f'\nInitial browser state (output of `noop(1000)`):\n{obs.get_agent_obs_text()}'
)
state: State | None = asyncio.run( state: State | None = asyncio.run(
run_controller( run_controller(
config=config, config=config,
@@ -137,6 +164,9 @@ def process_instance(
content=task_str content=task_str
), # take output from initialize_runtime ), # take output from initialize_runtime
runtime=runtime, runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class
],
) )
) )
@@ -159,7 +189,7 @@ def process_instance(
return_val = complete_runtime(runtime) return_val = complete_runtime(runtime)
logger.info(f'Return value from complete_runtime: {return_val}') logger.info(f'Return value from complete_runtime: {return_val}')
reward = max(return_val['rewards']) reward = max(return_val['rewards'], default=0)
# history is now available as a stream of events, rather than list of pairs of (Action, Observation) # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here # for compatibility with the existing output format, we can remake the pairs here

View File

@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
prepare_dataset, prepare_dataset,
reset_logger_for_multiprocessing, reset_logger_for_multiprocessing,
run_evaluation, run_evaluation,
update_llm_config_for_completions_logging,
) )
from openhands.controller.state.state import State from openhands.controller.state.state import State
from openhands.core.config import ( from openhands.core.config import (
@@ -76,15 +77,13 @@ def get_config(
workspace_base=None, workspace_base=None,
workspace_mount_path=None, workspace_mount_path=None,
) )
config.set_llm_config(metadata.llm_config) config.set_llm_config(
if metadata.llm_config.log_completions: update_llm_config_for_completions_logging(
metadata.llm_config.log_completions_folder = os.path.join( metadata.llm_config,
metadata.eval_output_dir, 'llm_completions', instance_id metadata.eval_output_dir,
) instance_id,
logger.info(
f'Logging LLM completions for instance {instance_id} to '
f'{metadata.llm_config.log_completions_folder}'
) )
)
return config return config

View File

@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
prepare_dataset, prepare_dataset,
reset_logger_for_multiprocessing, reset_logger_for_multiprocessing,
run_evaluation, run_evaluation,
update_llm_config_for_completions_logging,
) )
from openhands.controller.state.state import State from openhands.controller.state.state import State
from openhands.core.config import ( from openhands.core.config import (
@@ -40,6 +41,7 @@ from openhands.utils.async_utils import call_async_from_sync
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true' USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true' USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response, 'CodeActAgent': codeact_user_response,
@@ -88,6 +90,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
'5. Think about edgecases and make sure your fix handles them as well\n' '5. Think about edgecases and make sure your fix handles them as well\n'
"Your thinking should be thorough and so it's fine if it's very long.\n" "Your thinking should be thorough and so it's fine if it's very long.\n"
) )
if RUN_WITH_BROWSING:
instruction += (
'<IMPORTANT!>\n'
'You SHOULD NEVER attempt to browse the web. '
'</IMPORTANT!>\n'
)
return instruction return instruction
@@ -142,18 +151,14 @@ def get_config(
workspace_base=None, workspace_base=None,
workspace_mount_path=None, workspace_mount_path=None,
) )
if metadata.llm_config.log_completions: config.set_llm_config(
metadata.llm_config.log_completions_folder = os.path.join( update_llm_config_for_completions_logging(
metadata.eval_output_dir, 'llm_completions', instance['instance_id'] metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
) )
logger.info( )
f'Logging LLM completions for instance {instance["instance_id"]} to '
f'{metadata.llm_config.log_completions_folder}'
)
config.set_llm_config(metadata.llm_config)
agent_config = AgentConfig( agent_config = AgentConfig(
codeact_enable_jupyter=False, codeact_enable_jupyter=False,
codeact_enable_browsing_delegate=False, codeact_enable_browsing=RUN_WITH_BROWSING,
codeact_enable_llm_editor=False, codeact_enable_llm_editor=False,
) )
config.set_agent_config(agent_config) config.set_agent_config(agent_config)

View File

@@ -34,6 +34,11 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then
USE_INSTANCE_IMAGE=true USE_INSTANCE_IMAGE=true
fi fi
if [ -z "$RUN_WITH_BROWSING" ]; then
echo "RUN_WITH_BROWSING not specified, use default false"
RUN_WITH_BROWSING=false
fi
if [ -z "$DATASET" ]; then if [ -z "$DATASET" ]; then
echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite" echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
@@ -47,6 +52,8 @@ fi
export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE" echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
get_agent_version get_agent_version
@@ -67,6 +74,10 @@ if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint" EVAL_NOTE="$EVAL_NOTE-no-hint"
fi fi
if [ "$RUN_WITH_BROWSING" = true ]; then
EVAL_NOTE="$EVAL_NOTE-with-browsing"
fi
if [ -n "$EXP_NAME" ]; then if [ -n "$EXP_NAME" ]; then
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME" EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
fi fi

View File

@@ -411,3 +411,20 @@ def reset_logger_for_multiprocessing(
) )
file_handler.setLevel(logging.INFO) file_handler.setLevel(logging.INFO)
logger.addHandler(file_handler) logger.addHandler(file_handler)
def update_llm_config_for_completions_logging(
    llm_config: LLMConfig,
    eval_output_dir: str,
    instance_id: str,
) -> LLMConfig:
    """Point the completions log of ``llm_config`` at a per-instance folder.

    When ``llm_config.log_completions`` is falsy the config is returned
    untouched. Otherwise ``log_completions_folder`` is set on the passed
    object itself (no copy is made) to
    ``<eval_output_dir>/llm_completions/<instance_id>`` and the destination
    is logged.

    Args:
        llm_config: The LLM configuration to update in place.
        eval_output_dir: Root output directory of the evaluation run.
        instance_id: Identifier used as the name of the per-instance folder.

    Returns:
        The same ``llm_config`` object that was passed in.
    """
    if not llm_config.log_completions:
        return llm_config

    completions_dir = os.path.join(eval_output_dir, 'llm_completions', instance_id)
    llm_config.log_completions_folder = completions_dir
    logger.info(
        f'Logging LLM completions for instance {instance_id} to '
        f'{llm_config.log_completions_folder}'
    )
    return llm_config

View File

@@ -16,6 +16,7 @@ from openhands.events.action import (
Action, Action,
AgentDelegateAction, AgentDelegateAction,
AgentFinishAction, AgentFinishAction,
BrowseInteractiveAction,
CmdRunAction, CmdRunAction,
FileEditAction, FileEditAction,
IPythonRunCellAction, IPythonRunCellAction,
@@ -23,6 +24,7 @@ from openhands.events.action import (
) )
from openhands.events.observation import ( from openhands.events.observation import (
AgentDelegateObservation, AgentDelegateObservation,
BrowserOutputObservation,
CmdOutputObservation, CmdOutputObservation,
FileEditObservation, FileEditObservation,
IPythonRunCellObservation, IPythonRunCellObservation,
@@ -42,7 +44,7 @@ from openhands.utils.prompt import PromptManager
class CodeActAgent(Agent): class CodeActAgent(Agent):
VERSION = '2.1' VERSION = '2.2'
""" """
The Code Act Agent is a minimalist agent. The Code Act Agent is a minimalist agent.
The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step. The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -105,7 +107,7 @@ class CodeActAgent(Agent):
if self.function_calling_active: if self.function_calling_active:
# Function calling mode # Function calling mode
self.tools = codeact_function_calling.get_tools( self.tools = codeact_function_calling.get_tools(
codeact_enable_browsing_delegate=self.config.codeact_enable_browsing_delegate, codeact_enable_browsing=self.config.codeact_enable_browsing,
codeact_enable_jupyter=self.config.codeact_enable_jupyter, codeact_enable_jupyter=self.config.codeact_enable_jupyter,
codeact_enable_llm_editor=self.config.codeact_enable_llm_editor, codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
) )
@@ -142,10 +144,10 @@ class CodeActAgent(Agent):
Args: Args:
action (Action): The action to convert. Can be one of: action (Action): The action to convert. Can be one of:
- AgentDelegateAction: For delegating tasks to other agents
- CmdRunAction: For executing bash commands - CmdRunAction: For executing bash commands
- IPythonRunCellAction: For running IPython code - IPythonRunCellAction: For running IPython code
- FileEditAction: For editing files - FileEditAction: For editing files
- BrowseInteractiveAction: For browsing the web
- AgentFinishAction: For ending the interaction - AgentFinishAction: For ending the interaction
- MessageAction: For sending messages - MessageAction: For sending messages
pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
@@ -169,6 +171,7 @@ class CodeActAgent(Agent):
CmdRunAction, CmdRunAction,
IPythonRunCellAction, IPythonRunCellAction,
FileEditAction, FileEditAction,
BrowseInteractiveAction,
), ),
) or (isinstance(action, AgentFinishAction) and action.source == 'agent'): ) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
if self.function_calling_active: if self.function_calling_active:
@@ -192,6 +195,10 @@ class CodeActAgent(Agent):
) )
return [] return []
else: else:
assert not isinstance(action, BrowseInteractiveAction), (
'BrowseInteractiveAction is not supported in non-function calling mode. Action: '
+ str(action)
)
content = [TextContent(text=self.action_parser.action_to_str(action))] content = [TextContent(text=self.action_parser.action_to_str(action))]
return [ return [
Message( Message(
@@ -266,6 +273,12 @@ class CodeActAgent(Agent):
elif isinstance(obs, FileEditObservation): elif isinstance(obs, FileEditObservation):
text = obs_prefix + truncate_content(str(obs), max_message_chars) text = obs_prefix + truncate_content(str(obs), max_message_chars)
message = Message(role='user', content=[TextContent(text=text)]) message = Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, BrowserOutputObservation):
text = obs.get_agent_obs_text()
message = Message(
role='user',
content=[TextContent(text=obs_prefix + text)],
)
elif isinstance(obs, AgentDelegateObservation): elif isinstance(obs, AgentDelegateObservation):
text = obs_prefix + truncate_content( text = obs_prefix + truncate_content(
obs.outputs['content'] if 'content' in obs.outputs else '', obs.outputs['content'] if 'content' in obs.outputs else '',
@@ -335,6 +348,7 @@ class CodeActAgent(Agent):
} }
if self.function_calling_active: if self.function_calling_active:
params['tools'] = self.tools params['tools'] = self.tools
params['parallel_tool_calls'] = False
else: else:
params['stop'] = [ params['stop'] = [
'</execute_ipython>', '</execute_ipython>',

View File

@@ -5,6 +5,7 @@ This is similar to the functionality of `CodeActResponseParser`.
import json import json
from browsergym.core.action.highlevel import HighLevelActionSet
from litellm import ( from litellm import (
ChatCompletionToolParam, ChatCompletionToolParam,
ChatCompletionToolParamFunctionChunk, ChatCompletionToolParamFunctionChunk,
@@ -16,6 +17,7 @@ from openhands.events.action import (
Action, Action,
AgentDelegateAction, AgentDelegateAction,
AgentFinishAction, AgentFinishAction,
BrowseInteractiveAction,
CmdRunAction, CmdRunAction,
FileEditAction, FileEditAction,
IPythonRunCellAction, IPythonRunCellAction,
@@ -272,24 +274,146 @@ StrReplaceEditorTool = ChatCompletionToolParam(
), ),
) )
_BROWSER_DELEGATION = """Delegate the task to another browsing agent. # from browsergym/core/action/highlevel.py
The assistant should delegate the task if it needs to browse the Internet. _browser_action_space = HighLevelActionSet(
subsets=['bid', 'nav'],
strict=False, # less strict on the parsing of the actions
multiaction=True, # enable the agent to take multiple actions at once
)
_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
The following 15 functions are available. Nothing else is supported.
goto(url: str)
Description: Navigate to a url.
Examples:
goto('http://www.example.com')
go_back()
Description: Navigate to the previous page in history.
Examples:
go_back()
go_forward()
Description: Navigate to the next page in history.
Examples:
go_forward()
noop(wait_ms: float = 1000)
Description: Do nothing, and optionally wait for the given time (in milliseconds).
You can use this to get the current page content and/or wait for the page to load.
Examples:
noop()
noop(500)
scroll(delta_x: float, delta_y: float)
Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
Examples:
scroll(0, 200)
scroll(-50.2, -100.5)
fill(bid: str, value: str)
Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
Examples:
fill('237', 'example value')
fill('45', 'multi-line\nexample')
fill('a12', 'example with "quotes"')
select_option(bid: str, options: str | list[str])
Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
Examples:
select_option('a48', 'blue')
select_option('c48', ['red', 'green', 'blue'])
click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
Description: Click an element.
Examples:
click('a51')
click('b22', button='right')
click('48', button='middle', modifiers=['Shift'])
dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
Description: Double click an element.
Examples:
dblclick('12')
dblclick('ca42', button='right')
dblclick('178', button='middle', modifiers=['Shift'])
hover(bid: str)
Description: Hover over an element.
Examples:
hover('b8')
press(bid: str, key_comb: str)
Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
Examples:
press('88', 'Backspace')
press('a26', 'ControlOrMeta+a')
press('a61', 'Meta+Shift+t')
focus(bid: str)
Description: Focus the matching element.
Examples:
focus('b455')
clear(bid: str)
Description: Clear the input field.
Examples:
clear('996')
drag_and_drop(from_bid: str, to_bid: str)
Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
Examples:
drag_and_drop('56', '498')
upload_file(bid: str, file: str | list[str])
Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
Examples:
upload_file('572', '/home/user/my_receipt.pdf')
upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
More than 2-3 actions usually leads to failure or unexpected behavior. Example:
fill('a12', 'example with "quotes"')
click('a51')
click('48', button='middle', modifiers=['Shift'])
""" """
BrowserDelegationTool = ChatCompletionToolParam( for _, action in _browser_action_space.action_set.items():
assert (
action.signature in _BROWSER_DESCRIPTION
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
assert (
action.description in _BROWSER_DESCRIPTION
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
BrowserTool = ChatCompletionToolParam(
type='function', type='function',
function=ChatCompletionToolParamFunctionChunk( function=ChatCompletionToolParamFunctionChunk(
name='delegate_to_browsing_agent', name='browser',
description=_BROWSER_DELEGATION, description=_BROWSER_DESCRIPTION,
parameters={ parameters={
'type': 'object', 'type': 'object',
'properties': { 'properties': {
'task': { 'code': {
'type': 'string', 'type': 'string',
'description': 'The task for the browsing agent to execute. It should include all the necessary context and specify what information the browsing agent should return.', 'description': 'The Python code that interacts with the browser.',
}, }
}, },
'required': ['task'], 'required': ['code'],
}, },
), ),
) )
@@ -357,6 +481,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
f'TOOL CALL: str_replace_editor -> file_editor with code: {code}' f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
) )
action = IPythonRunCellAction(code=code, include_extra=False) action = IPythonRunCellAction(code=code, include_extra=False)
elif tool_call.function.name == 'browser':
action = BrowseInteractiveAction(browser_actions=arguments['code'])
else: else:
raise RuntimeError(f'Unknown tool call: {tool_call.function.name}') raise RuntimeError(f'Unknown tool call: {tool_call.function.name}')
@@ -381,13 +507,13 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
def get_tools( def get_tools(
codeact_enable_browsing_delegate: bool = False, codeact_enable_browsing: bool = False,
codeact_enable_llm_editor: bool = False, codeact_enable_llm_editor: bool = False,
codeact_enable_jupyter: bool = False, codeact_enable_jupyter: bool = False,
) -> list[ChatCompletionToolParam]: ) -> list[ChatCompletionToolParam]:
tools = [CmdRunTool, FinishTool] tools = [CmdRunTool, FinishTool]
if codeact_enable_browsing_delegate: if codeact_enable_browsing:
tools.append(BrowserDelegationTool) tools.append(BrowserTool)
if codeact_enable_jupyter: if codeact_enable_jupyter:
tools.append(IPythonTool) tools.append(IPythonTool)
if codeact_enable_llm_editor: if codeact_enable_llm_editor:

View File

@@ -9,7 +9,7 @@ class AgentConfig:
Attributes: Attributes:
function_calling: Whether function calling is enabled. Default is True. function_calling: Whether function calling is enabled. Default is True.
codeact_enable_browsing_delegate: Whether browsing delegate is enabled in the action space. Default is False. Only works with function calling. codeact_enable_browsing: Whether browsing delegate is enabled in the action space. Default is False. Only works with function calling.
codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling. codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling.
codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is False. codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is False.
micro_agent_name: The name of the micro agent to use for this agent. micro_agent_name: The name of the micro agent to use for this agent.
@@ -19,7 +19,7 @@ class AgentConfig:
""" """
function_calling: bool = True function_calling: bool = True
codeact_enable_browsing_delegate: bool = True codeact_enable_browsing: bool = True
codeact_enable_llm_editor: bool = False codeact_enable_llm_editor: bool = False
codeact_enable_jupyter: bool = True codeact_enable_jupyter: bool = True
micro_agent_name: str | None = None micro_agent_name: str | None = None

View File

@@ -1,5 +1,7 @@
from dataclasses import dataclass, field from dataclasses import dataclass, field
from browsergym.utils.obs import flatten_axtree_to_str
from openhands.core.schema import ObservationType from openhands.core.schema import ObservationType
from openhands.events.observation.observation import Observation from openhands.events.observation.observation import Observation
@@ -29,7 +31,7 @@ class BrowserOutputObservation(Observation):
return 'Visited ' + self.url return 'Visited ' + self.url
def __str__(self) -> str: def __str__(self) -> str:
return ( ret = (
'**BrowserOutputObservation**\n' '**BrowserOutputObservation**\n'
f'URL: {self.url}\n' f'URL: {self.url}\n'
f'Error: {self.error}\n' f'Error: {self.error}\n'
@@ -38,5 +40,47 @@ class BrowserOutputObservation(Observation):
f'Last browser action: {self.last_browser_action}\n' f'Last browser action: {self.last_browser_action}\n'
f'Last browser action error: {self.last_browser_action_error}\n' f'Last browser action error: {self.last_browser_action_error}\n'
f'Focused element bid: {self.focused_element_bid}\n' f'Focused element bid: {self.focused_element_bid}\n'
f'CONTENT: {self.content}\n' f'Content: {self.content}\n'
) )
ret += '--- Agent Observation ---\n'
ret += self.get_agent_obs_text()
return ret
def get_agent_obs_text(self) -> str:
    """Build the concise text view of this observation shown to the agent.

    The text contains, in order: the current URL, the focused element bid,
    either the last browser action error (when ``self.error`` is truthy) or a
    success marker, and finally the flattened accessibility tree. If producing
    the tree raises, a bracketed error note is appended in its place.

    Returns:
        The assembled text.
    """
    pieces = [
        f'[Current URL: {self.url}]\n',
        f'[Focused element bid: {self.focused_element_bid}]\n\n',
    ]

    if not self.error:
        pieces.append('[Action executed successfully.]\n')
    else:
        pieces.append(
            '================ BEGIN error message ===============\n'
            'The following error occurred when executing the last action:\n'
            f'{self.last_browser_action_error}\n'
            '================ END error message ===============\n'
        )

    try:
        # Visible-only filtering is deliberately off: the agent is shown the
        # full page content for simplicity.
        # FIXME: handle the case when the web page is too large
        axtree_text = self.get_axtree_str(filter_visible_only=False)
        pieces.append(
            f'============== BEGIN accessibility tree ==============\n'
            f'{axtree_text}\n'
            f'============== END accessibility tree ==============\n'
        )
    except Exception as exc:
        # Best effort: a tree that cannot be rendered must not hide the
        # URL / error information gathered above.
        pieces.append(
            f'\n[Error encountered when processing the accessibility tree: {exc}]'
        )
    return ''.join(pieces)
def get_axtree_str(self, filter_visible_only: bool = False) -> str:
    """Flatten this observation's accessibility tree to text and remember it.

    Delegates to browsergym's ``flatten_axtree_to_str`` with clickable
    marking on and generic-node skipping off, stores the result on
    ``self._axtree_str``, and returns it.

    Args:
        filter_visible_only: Forwarded unchanged to ``flatten_axtree_to_str``.

    Returns:
        The flattened accessibility tree text.
    """
    tree = self.axtree_object
    element_properties = self.extra_element_properties
    flattened = flatten_axtree_to_str(
        tree,
        extra_properties=element_properties,
        with_clickable=True,
        skip_generic=False,
        filter_visible_only=filter_visible_only,
    )
    # Keep the most recent rendering on the instance.
    self._axtree_str = flattened
    return flattened

View File

@@ -81,7 +81,10 @@ class BrowserEnv:
raise ValueError( raise ValueError(
f'Unsupported browsergym eval env: {self.browsergym_eval_env}' f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
) )
env = gym.make(self.browsergym_eval_env) env = gym.make(
self.browsergym_eval_env,
tags_to_mark='all',
)
else: else:
env = gym.make( env = gym.make(
'browsergym/openended', 'browsergym/openended',
@@ -89,6 +92,7 @@ class BrowserEnv:
wait_for_user_message=False, wait_for_user_message=False,
headless=True, headless=True,
disable_env_checker=True, disable_env_checker=True,
tags_to_mark='all',
) )
obs, info = env.reset() obs, info = env.reset()