mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-09 23:08:04 -05:00
feat(agent, CodeAct 2.2): native CodeAct support for Browsing (#4667)
Co-authored-by: tofarr <tofarr@gmail.com>
This commit is contained in:
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
update_llm_config_for_completions_logging,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
@@ -55,18 +56,14 @@ def get_config(
|
||||
workspace_base=None,
|
||||
workspace_mount_path=None,
|
||||
)
|
||||
if metadata.llm_config.log_completions:
|
||||
metadata.llm_config.log_completions_folder = os.path.join(
|
||||
metadata.eval_output_dir, 'llm_completions', instance_id
|
||||
config.set_llm_config(
|
||||
update_llm_config_for_completions_logging(
|
||||
metadata.llm_config, metadata.eval_output_dir, instance_id
|
||||
)
|
||||
logger.info(
|
||||
f'Logging LLM completions for instance {instance_id} to '
|
||||
f'{metadata.llm_config.log_completions_folder}'
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
)
|
||||
agent_config = AgentConfig(
|
||||
codeact_enable_jupyter=True,
|
||||
codeact_enable_browsing_delegate=True,
|
||||
codeact_enable_browsing=True,
|
||||
codeact_enable_llm_editor=False,
|
||||
)
|
||||
config.set_agent_config(agent_config)
|
||||
|
||||
44
evaluation/integration_tests/tests/t06_github_pr_browsing.py
Normal file
44
evaluation/integration_tests/tests/t06_github_pr_browsing.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
|
||||
from openhands.events.action import AgentFinishAction, MessageAction
|
||||
from openhands.events.event import Event
|
||||
from openhands.events.observation import AgentDelegateObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
|
||||
|
||||
class Test(BaseIntegrationTest):
    """Integration test that asks the agent to read a GitHub pull request.

    The agent is given ``INSTRUCTION`` and the run passes when any message-like
    event in the history contains one of the expected keywords.
    """

    INSTRUCTION = 'Look at https://github.com/All-Hands-AI/OpenHands/pull/8, and tell me what is happening there and what did @asadm suggest.'

    @classmethod
    def initialize_runtime(cls, runtime: Runtime) -> None:
        """Nothing needs to be prepared in the runtime for this test."""
        pass

    @classmethod
    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
        """Check whether any agent message mentions one of the expected keywords.

        Args:
            runtime: The runtime the agent ran in (not used by this check).
            histories: The events produced during the run.

        Returns:
            A successful ``TestResult`` as soon as one message, finish action or
            delegate observation contains an expected keyword; otherwise a failed
            ``TestResult`` whose reason lists the inspected events.
        """
        # Any one of these substrings counts as a correct answer.
        expected_keywords = ('non-commercial', 'MIT', 'Apache 2.0')

        # Only these event types carry text that could hold the answer.
        message_actions = [
            event
            for event in histories
            if isinstance(
                event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
            )
        ]

        for event in message_actions:
            if isinstance(event, AgentFinishAction):
                # Finish actions keep their text in the ``outputs`` dict.
                content = event.outputs.get('content', '')
            else:
                # MessageAction and AgentDelegateObservation expose ``content``.
                content = event.content

            if any(keyword in content for keyword in expected_keywords):
                return TestResult(success=True)

        return TestResult(
            success=False,
            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
        )
|
||||
@@ -10,10 +10,12 @@ import pandas as pd
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
EvalOutput,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
update_llm_config_for_completions_logging,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
@@ -29,7 +31,10 @@ from openhands.events.action import (
|
||||
CmdRunAction,
|
||||
MessageAction,
|
||||
)
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.events.observation import (
|
||||
BrowserOutputObservation,
|
||||
CmdOutputObservation,
|
||||
)
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.runtime.browser.browser_env import (
|
||||
BROWSER_EVAL_GET_GOAL_ACTION,
|
||||
@@ -37,7 +42,11 @@ from openhands.runtime.browser.browser_env import (
|
||||
)
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
}
|
||||
|
||||
|
||||
def get_config(
|
||||
@@ -47,25 +56,32 @@ def get_config(
|
||||
config = AppConfig(
|
||||
default_agent=metadata.agent_class,
|
||||
run_as_openhands=False,
|
||||
runtime='eventstream',
|
||||
runtime=os.environ.get('RUNTIME', 'eventstream'),
|
||||
max_iterations=metadata.max_iterations,
|
||||
sandbox=SandboxConfig(
|
||||
base_container_image='xingyaoww/od-eval-miniwob:v1.0',
|
||||
enable_auto_lint=True,
|
||||
use_host_network=False,
|
||||
browsergym_eval_env=env_id,
|
||||
api_key=os.environ.get('ALLHANDS_API_KEY', None),
|
||||
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
|
||||
keep_remote_runtime_alive=False,
|
||||
),
|
||||
# do not mount workspace
|
||||
workspace_base=None,
|
||||
workspace_mount_path=None,
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
config.set_llm_config(
|
||||
update_llm_config_for_completions_logging(
|
||||
metadata.llm_config, metadata.eval_output_dir, env_id
|
||||
)
|
||||
)
|
||||
return config
|
||||
|
||||
|
||||
def initialize_runtime(
|
||||
runtime: Runtime,
|
||||
) -> str:
|
||||
) -> tuple[str, BrowserOutputObservation]:
|
||||
"""Initialize the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
@@ -85,8 +101,14 @@ def initialize_runtime(
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
goal = obs.content
|
||||
|
||||
# Run noop to get the initial browser observation (e.g., the page URL & content)
|
||||
action = BrowseInteractiveAction(browser_actions='noop(1000)')
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||
return goal
|
||||
return goal, obs
|
||||
|
||||
|
||||
def complete_runtime(
|
||||
@@ -117,7 +139,7 @@ def process_instance(
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
) -> EvalOutput:
|
||||
env_id = instance.id
|
||||
env_id = instance.instance_id
|
||||
config = get_config(metadata, env_id)
|
||||
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
@@ -129,7 +151,12 @@ def process_instance(
|
||||
|
||||
runtime = create_runtime(config)
|
||||
call_async_from_sync(runtime.connect)
|
||||
task_str = initialize_runtime(runtime)
|
||||
task_str, obs = initialize_runtime(runtime)
|
||||
|
||||
task_str += (
|
||||
f'\nInitial browser state (output of `noop(1000)`):\n{obs.get_agent_obs_text()}'
|
||||
)
|
||||
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
config=config,
|
||||
@@ -137,6 +164,9 @@ def process_instance(
|
||||
content=task_str
|
||||
), # take output from initialize_runtime
|
||||
runtime=runtime,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
metadata.agent_class
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
@@ -159,7 +189,7 @@ def process_instance(
|
||||
|
||||
return_val = complete_runtime(runtime)
|
||||
logger.info(f'Return value from complete_runtime: {return_val}')
|
||||
reward = max(return_val['rewards'])
|
||||
reward = max(return_val['rewards'], default=0)
|
||||
|
||||
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
|
||||
# for compatibility with the existing output format, we can remake the pairs here
|
||||
|
||||
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
update_llm_config_for_completions_logging,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
@@ -76,15 +77,13 @@ def get_config(
|
||||
workspace_base=None,
|
||||
workspace_mount_path=None,
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
if metadata.llm_config.log_completions:
|
||||
metadata.llm_config.log_completions_folder = os.path.join(
|
||||
metadata.eval_output_dir, 'llm_completions', instance_id
|
||||
)
|
||||
logger.info(
|
||||
f'Logging LLM completions for instance {instance_id} to '
|
||||
f'{metadata.llm_config.log_completions_folder}'
|
||||
config.set_llm_config(
|
||||
update_llm_config_for_completions_logging(
|
||||
metadata.llm_config,
|
||||
metadata.eval_output_dir,
|
||||
instance_id,
|
||||
)
|
||||
)
|
||||
return config
|
||||
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
|
||||
prepare_dataset,
|
||||
reset_logger_for_multiprocessing,
|
||||
run_evaluation,
|
||||
update_llm_config_for_completions_logging,
|
||||
)
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
@@ -40,6 +41,7 @@ from openhands.utils.async_utils import call_async_from_sync
|
||||
|
||||
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
|
||||
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
|
||||
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
@@ -88,6 +90,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
|
||||
'5. Think about edgecases and make sure your fix handles them as well\n'
|
||||
"Your thinking should be thorough and so it's fine if it's very long.\n"
|
||||
)
|
||||
|
||||
if RUN_WITH_BROWSING:
|
||||
instruction += (
|
||||
'<IMPORTANT!>\n'
|
||||
'You SHOULD NEVER attempt to browse the web. '
|
||||
'</IMPORTANT!>\n'
|
||||
)
|
||||
return instruction
|
||||
|
||||
|
||||
@@ -142,18 +151,14 @@ def get_config(
|
||||
workspace_base=None,
|
||||
workspace_mount_path=None,
|
||||
)
|
||||
if metadata.llm_config.log_completions:
|
||||
metadata.llm_config.log_completions_folder = os.path.join(
|
||||
metadata.eval_output_dir, 'llm_completions', instance['instance_id']
|
||||
config.set_llm_config(
|
||||
update_llm_config_for_completions_logging(
|
||||
metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
|
||||
)
|
||||
logger.info(
|
||||
f'Logging LLM completions for instance {instance["instance_id"]} to '
|
||||
f'{metadata.llm_config.log_completions_folder}'
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
)
|
||||
agent_config = AgentConfig(
|
||||
codeact_enable_jupyter=False,
|
||||
codeact_enable_browsing_delegate=False,
|
||||
codeact_enable_browsing=RUN_WITH_BROWSING,
|
||||
codeact_enable_llm_editor=False,
|
||||
)
|
||||
config.set_agent_config(agent_config)
|
||||
|
||||
@@ -34,6 +34,11 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then
|
||||
USE_INSTANCE_IMAGE=true
|
||||
fi
|
||||
|
||||
if [ -z "$RUN_WITH_BROWSING" ]; then
|
||||
echo "RUN_WITH_BROWSING not specified, use default false"
|
||||
RUN_WITH_BROWSING=false
|
||||
fi
|
||||
|
||||
|
||||
if [ -z "$DATASET" ]; then
|
||||
echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
|
||||
@@ -47,6 +52,8 @@ fi
|
||||
|
||||
export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
|
||||
echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
|
||||
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
|
||||
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
|
||||
|
||||
get_agent_version
|
||||
|
||||
@@ -67,6 +74,10 @@ if [ "$USE_HINT_TEXT" = false ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-no-hint"
|
||||
fi
|
||||
|
||||
if [ "$RUN_WITH_BROWSING" = true ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-with-browsing"
|
||||
fi
|
||||
|
||||
if [ -n "$EXP_NAME" ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
|
||||
fi
|
||||
|
||||
@@ -411,3 +411,20 @@ def reset_logger_for_multiprocessing(
|
||||
)
|
||||
file_handler.setLevel(logging.INFO)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
|
||||
def update_llm_config_for_completions_logging(
    llm_config: LLMConfig,
    eval_output_dir: str,
    instance_id: str,
) -> LLMConfig:
    """Point completion logging at a per-instance folder.

    When ``llm_config.log_completions`` is truthy, ``log_completions_folder`` is
    set in place to ``<eval_output_dir>/llm_completions/<instance_id>`` and the
    destination is logged. Otherwise the config is left untouched.

    Args:
        llm_config: The LLM config to update (modified in place).
        eval_output_dir: Root output directory of the evaluation run.
        instance_id: Identifier of the instance being evaluated.

    Returns:
        The same ``llm_config`` object that was passed in.
    """
    if not llm_config.log_completions:
        return llm_config

    completions_dir = os.path.join(eval_output_dir, 'llm_completions', instance_id)
    llm_config.log_completions_folder = completions_dir
    logger.info(
        f'Logging LLM completions for instance {instance_id} to '
        f'{llm_config.log_completions_folder}'
    )
    return llm_config
|
||||
|
||||
@@ -16,6 +16,7 @@ from openhands.events.action import (
|
||||
Action,
|
||||
AgentDelegateAction,
|
||||
AgentFinishAction,
|
||||
BrowseInteractiveAction,
|
||||
CmdRunAction,
|
||||
FileEditAction,
|
||||
IPythonRunCellAction,
|
||||
@@ -23,6 +24,7 @@ from openhands.events.action import (
|
||||
)
|
||||
from openhands.events.observation import (
|
||||
AgentDelegateObservation,
|
||||
BrowserOutputObservation,
|
||||
CmdOutputObservation,
|
||||
FileEditObservation,
|
||||
IPythonRunCellObservation,
|
||||
@@ -42,7 +44,7 @@ from openhands.utils.prompt import PromptManager
|
||||
|
||||
|
||||
class CodeActAgent(Agent):
|
||||
VERSION = '2.1'
|
||||
VERSION = '2.2'
|
||||
"""
|
||||
The Code Act Agent is a minimalist agent.
|
||||
The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
|
||||
@@ -105,7 +107,7 @@ class CodeActAgent(Agent):
|
||||
if self.function_calling_active:
|
||||
# Function calling mode
|
||||
self.tools = codeact_function_calling.get_tools(
|
||||
codeact_enable_browsing_delegate=self.config.codeact_enable_browsing_delegate,
|
||||
codeact_enable_browsing=self.config.codeact_enable_browsing,
|
||||
codeact_enable_jupyter=self.config.codeact_enable_jupyter,
|
||||
codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
|
||||
)
|
||||
@@ -142,10 +144,10 @@ class CodeActAgent(Agent):
|
||||
|
||||
Args:
|
||||
action (Action): The action to convert. Can be one of:
|
||||
- AgentDelegateAction: For delegating tasks to other agents
|
||||
- CmdRunAction: For executing bash commands
|
||||
- IPythonRunCellAction: For running IPython code
|
||||
- FileEditAction: For editing files
|
||||
- BrowseInteractiveAction: For browsing the web
|
||||
- AgentFinishAction: For ending the interaction
|
||||
- MessageAction: For sending messages
|
||||
pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
|
||||
@@ -169,6 +171,7 @@ class CodeActAgent(Agent):
|
||||
CmdRunAction,
|
||||
IPythonRunCellAction,
|
||||
FileEditAction,
|
||||
BrowseInteractiveAction,
|
||||
),
|
||||
) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
|
||||
if self.function_calling_active:
|
||||
@@ -192,6 +195,10 @@ class CodeActAgent(Agent):
|
||||
)
|
||||
return []
|
||||
else:
|
||||
assert not isinstance(action, BrowseInteractiveAction), (
|
||||
'BrowseInteractiveAction is not supported in non-function calling mode. Action: '
|
||||
+ str(action)
|
||||
)
|
||||
content = [TextContent(text=self.action_parser.action_to_str(action))]
|
||||
return [
|
||||
Message(
|
||||
@@ -266,6 +273,12 @@ class CodeActAgent(Agent):
|
||||
elif isinstance(obs, FileEditObservation):
|
||||
text = obs_prefix + truncate_content(str(obs), max_message_chars)
|
||||
message = Message(role='user', content=[TextContent(text=text)])
|
||||
elif isinstance(obs, BrowserOutputObservation):
|
||||
text = obs.get_agent_obs_text()
|
||||
message = Message(
|
||||
role='user',
|
||||
content=[TextContent(text=obs_prefix + text)],
|
||||
)
|
||||
elif isinstance(obs, AgentDelegateObservation):
|
||||
text = obs_prefix + truncate_content(
|
||||
obs.outputs['content'] if 'content' in obs.outputs else '',
|
||||
@@ -335,6 +348,7 @@ class CodeActAgent(Agent):
|
||||
}
|
||||
if self.function_calling_active:
|
||||
params['tools'] = self.tools
|
||||
params['parallel_tool_calls'] = False
|
||||
else:
|
||||
params['stop'] = [
|
||||
'</execute_ipython>',
|
||||
|
||||
@@ -5,6 +5,7 @@ This is similar to the functionality of `CodeActResponseParser`.
|
||||
|
||||
import json
|
||||
|
||||
from browsergym.core.action.highlevel import HighLevelActionSet
|
||||
from litellm import (
|
||||
ChatCompletionToolParam,
|
||||
ChatCompletionToolParamFunctionChunk,
|
||||
@@ -16,6 +17,7 @@ from openhands.events.action import (
|
||||
Action,
|
||||
AgentDelegateAction,
|
||||
AgentFinishAction,
|
||||
BrowseInteractiveAction,
|
||||
CmdRunAction,
|
||||
FileEditAction,
|
||||
IPythonRunCellAction,
|
||||
@@ -272,24 +274,146 @@ StrReplaceEditorTool = ChatCompletionToolParam(
|
||||
),
|
||||
)
|
||||
|
||||
_BROWSER_DELEGATION = """Delegate the task to another browsing agent.
|
||||
The assistant should delegate the task if it needs to browse the Internet.
|
||||
# from browsergym/core/action/highlevel.py
|
||||
_browser_action_space = HighLevelActionSet(
|
||||
subsets=['bid', 'nav'],
|
||||
strict=False, # less strict on the parsing of the actions
|
||||
multiaction=True, # enable the agent to take multiple actions at once
|
||||
)
|
||||
|
||||
|
||||
_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
|
||||
The following 15 functions are available. Nothing else is supported.
|
||||
|
||||
goto(url: str)
|
||||
Description: Navigate to a url.
|
||||
Examples:
|
||||
goto('http://www.example.com')
|
||||
|
||||
go_back()
|
||||
Description: Navigate to the previous page in history.
|
||||
Examples:
|
||||
go_back()
|
||||
|
||||
go_forward()
|
||||
Description: Navigate to the next page in history.
|
||||
Examples:
|
||||
go_forward()
|
||||
|
||||
noop(wait_ms: float = 1000)
|
||||
Description: Do nothing, and optionally wait for the given time (in milliseconds).
|
||||
You can use this to get the current page content and/or wait for the page to load.
|
||||
Examples:
|
||||
noop()
|
||||
|
||||
noop(500)
|
||||
|
||||
scroll(delta_x: float, delta_y: float)
|
||||
Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
|
||||
Examples:
|
||||
scroll(0, 200)
|
||||
|
||||
scroll(-50.2, -100.5)
|
||||
|
||||
fill(bid: str, value: str)
|
||||
Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
|
||||
Examples:
|
||||
fill('237', 'example value')
|
||||
|
||||
fill('45', 'multi-line\nexample')
|
||||
|
||||
fill('a12', 'example with "quotes"')
|
||||
|
||||
select_option(bid: str, options: str | list[str])
|
||||
Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
|
||||
Examples:
|
||||
select_option('a48', 'blue')
|
||||
|
||||
select_option('c48', ['red', 'green', 'blue'])
|
||||
|
||||
click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
|
||||
Description: Click an element.
|
||||
Examples:
|
||||
click('a51')
|
||||
|
||||
click('b22', button='right')
|
||||
|
||||
click('48', button='middle', modifiers=['Shift'])
|
||||
|
||||
dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
|
||||
Description: Double click an element.
|
||||
Examples:
|
||||
dblclick('12')
|
||||
|
||||
dblclick('ca42', button='right')
|
||||
|
||||
dblclick('178', button='middle', modifiers=['Shift'])
|
||||
|
||||
hover(bid: str)
|
||||
Description: Hover over an element.
|
||||
Examples:
|
||||
hover('b8')
|
||||
|
||||
press(bid: str, key_comb: str)
|
||||
Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
|
||||
Examples:
|
||||
press('88', 'Backspace')
|
||||
|
||||
press('a26', 'ControlOrMeta+a')
|
||||
|
||||
press('a61', 'Meta+Shift+t')
|
||||
|
||||
focus(bid: str)
|
||||
Description: Focus the matching element.
|
||||
Examples:
|
||||
focus('b455')
|
||||
|
||||
clear(bid: str)
|
||||
Description: Clear the input field.
|
||||
Examples:
|
||||
clear('996')
|
||||
|
||||
drag_and_drop(from_bid: str, to_bid: str)
|
||||
Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
|
||||
Examples:
|
||||
drag_and_drop('56', '498')
|
||||
|
||||
upload_file(bid: str, file: str | list[str])
|
||||
Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
|
||||
Examples:
|
||||
upload_file('572', '/home/user/my_receipt.pdf')
|
||||
|
||||
upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
|
||||
|
||||
Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
|
||||
More than 2-3 actions usually leads to failure or unexpected behavior. Example:
|
||||
fill('a12', 'example with "quotes"')
|
||||
click('a51')
|
||||
click('48', button='middle', modifiers=['Shift'])
|
||||
"""
|
||||
|
||||
BrowserDelegationTool = ChatCompletionToolParam(
|
||||
for _, action in _browser_action_space.action_set.items():
|
||||
assert (
|
||||
action.signature in _BROWSER_DESCRIPTION
|
||||
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
|
||||
assert (
|
||||
action.description in _BROWSER_DESCRIPTION
|
||||
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
|
||||
|
||||
BrowserTool = ChatCompletionToolParam(
|
||||
type='function',
|
||||
function=ChatCompletionToolParamFunctionChunk(
|
||||
name='delegate_to_browsing_agent',
|
||||
description=_BROWSER_DELEGATION,
|
||||
name='browser',
|
||||
description=_BROWSER_DESCRIPTION,
|
||||
parameters={
|
||||
'type': 'object',
|
||||
'properties': {
|
||||
'task': {
|
||||
'code': {
|
||||
'type': 'string',
|
||||
'description': 'The task for the browsing agent to execute. It should include all the necessary context and specify what information the browsing agent should return.',
|
||||
},
|
||||
'description': 'The Python code that interacts with the browser.',
|
||||
}
|
||||
},
|
||||
'required': ['task'],
|
||||
'required': ['code'],
|
||||
},
|
||||
),
|
||||
)
|
||||
@@ -357,6 +481,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
|
||||
f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
|
||||
)
|
||||
action = IPythonRunCellAction(code=code, include_extra=False)
|
||||
elif tool_call.function.name == 'browser':
|
||||
action = BrowseInteractiveAction(browser_actions=arguments['code'])
|
||||
else:
|
||||
raise RuntimeError(f'Unknown tool call: {tool_call.function.name}')
|
||||
|
||||
@@ -381,13 +507,13 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
|
||||
|
||||
|
||||
def get_tools(
|
||||
codeact_enable_browsing_delegate: bool = False,
|
||||
codeact_enable_browsing: bool = False,
|
||||
codeact_enable_llm_editor: bool = False,
|
||||
codeact_enable_jupyter: bool = False,
|
||||
) -> list[ChatCompletionToolParam]:
|
||||
tools = [CmdRunTool, FinishTool]
|
||||
if codeact_enable_browsing_delegate:
|
||||
tools.append(BrowserDelegationTool)
|
||||
if codeact_enable_browsing:
|
||||
tools.append(BrowserTool)
|
||||
if codeact_enable_jupyter:
|
||||
tools.append(IPythonTool)
|
||||
if codeact_enable_llm_editor:
|
||||
|
||||
@@ -9,7 +9,7 @@ class AgentConfig:
|
||||
|
||||
Attributes:
|
||||
function_calling: Whether function calling is enabled. Default is True.
|
||||
codeact_enable_browsing_delegate: Whether browsing delegate is enabled in the action space. Default is False. Only works with function calling.
|
||||
codeact_enable_browsing: Whether browsing is enabled in the action space. Default is False. Only works with function calling.
|
||||
codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling.
|
||||
codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is False.
|
||||
micro_agent_name: The name of the micro agent to use for this agent.
|
||||
@@ -19,7 +19,7 @@ class AgentConfig:
|
||||
"""
|
||||
|
||||
function_calling: bool = True
|
||||
codeact_enable_browsing_delegate: bool = True
|
||||
codeact_enable_browsing: bool = True
|
||||
codeact_enable_llm_editor: bool = False
|
||||
codeact_enable_jupyter: bool = True
|
||||
micro_agent_name: str | None = None
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from browsergym.utils.obs import flatten_axtree_to_str
|
||||
|
||||
from openhands.core.schema import ObservationType
|
||||
from openhands.events.observation.observation import Observation
|
||||
|
||||
@@ -29,7 +31,7 @@ class BrowserOutputObservation(Observation):
|
||||
return 'Visited ' + self.url
|
||||
|
||||
def __str__(self) -> str:
|
||||
return (
|
||||
ret = (
|
||||
'**BrowserOutputObservation**\n'
|
||||
f'URL: {self.url}\n'
|
||||
f'Error: {self.error}\n'
|
||||
@@ -38,5 +40,47 @@ class BrowserOutputObservation(Observation):
|
||||
f'Last browser action: {self.last_browser_action}\n'
|
||||
f'Last browser action error: {self.last_browser_action_error}\n'
|
||||
f'Focused element bid: {self.focused_element_bid}\n'
|
||||
f'CONTENT: {self.content}\n'
|
||||
f'Content: {self.content}\n'
|
||||
)
|
||||
ret += '--- Agent Observation ---\n'
|
||||
ret += self.get_agent_obs_text()
|
||||
return ret
|
||||
|
||||
def get_agent_obs_text(self) -> str:
    """Build the concise text form of this observation that is shown to the agent.

    The text contains the current URL and focused element bid, then either the
    last browser action error or a success marker, followed by the flattened
    accessibility tree. If producing the tree fails, a note with the exception
    message is appended instead.
    """
    parts = [
        f'[Current URL: {self.url}]\n',
        f'[Focused element bid: {self.focused_element_bid}]\n\n',
    ]

    if not self.error:
        parts.append('[Action executed successfully.]\n')
    else:
        parts.append(
            '================ BEGIN error message ===============\n'
            'The following error occurred when executing the last action:\n'
            f'{self.last_browser_action_error}\n'
            '================ END error message ===============\n'
        )

    try:
        # The tree is not restricted to visible elements: the agent gets the
        # full content of the web page for simplicity.
        # FIXME: handle the case when the web page is too large
        axtree_txt = self.get_axtree_str(filter_visible_only=False)
        parts.append(
            '============== BEGIN accessibility tree ==============\n'
            f'{axtree_txt}\n'
            '============== END accessibility tree ==============\n'
        )
    except Exception as e:
        # Best effort: report the failure to the agent rather than raising.
        parts.append(
            f'\n[Error encountered when processing the accessibility tree: {e}]'
        )

    return ''.join(parts)
|
||||
|
||||
def get_axtree_str(self, filter_visible_only: bool = False) -> str:
    """Flatten the stored accessibility tree to text and remember the result.

    Args:
        filter_visible_only: Forwarded unchanged to ``flatten_axtree_to_str``.

    Returns:
        The flattened accessibility tree text, which is also stored on
        ``self._axtree_str``.
    """
    tree = self.axtree_object
    flatten_options = {
        'extra_properties': self.extra_element_properties,
        'with_clickable': True,
        'skip_generic': False,
        'filter_visible_only': filter_visible_only,
    }
    flattened = flatten_axtree_to_str(tree, **flatten_options)
    self._axtree_str = flattened
    return flattened
|
||||
|
||||
@@ -81,7 +81,10 @@ class BrowserEnv:
|
||||
raise ValueError(
|
||||
f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
|
||||
)
|
||||
env = gym.make(self.browsergym_eval_env)
|
||||
env = gym.make(
|
||||
self.browsergym_eval_env,
|
||||
tags_to_mark='all',
|
||||
)
|
||||
else:
|
||||
env = gym.make(
|
||||
'browsergym/openended',
|
||||
@@ -89,6 +92,7 @@ class BrowserEnv:
|
||||
wait_for_user_message=False,
|
||||
headless=True,
|
||||
disable_env_checker=True,
|
||||
tags_to_mark='all',
|
||||
)
|
||||
|
||||
obs, info = env.reset()
|
||||
|
||||
Reference in New Issue
Block a user