feat(agent, CodeAct 2.2): native CodeAct support for Browsing (#4667)

Co-authored-by: tofarr <tofarr@gmail.com>
This commit is contained in:
Xingyao Wang
2024-11-04 10:27:27 -06:00
committed by GitHub
parent f0af90bff3
commit 966da7b7c8
12 changed files with 346 additions and 55 deletions

View File

@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
@@ -55,18 +56,14 @@ def get_config(
workspace_base=None,
workspace_mount_path=None,
)
if metadata.llm_config.log_completions:
metadata.llm_config.log_completions_folder = os.path.join(
metadata.eval_output_dir, 'llm_completions', instance_id
config.set_llm_config(
update_llm_config_for_completions_logging(
metadata.llm_config, metadata.eval_output_dir, instance_id
)
logger.info(
f'Logging LLM completions for instance {instance_id} to '
f'{metadata.llm_config.log_completions_folder}'
)
config.set_llm_config(metadata.llm_config)
)
agent_config = AgentConfig(
codeact_enable_jupyter=True,
codeact_enable_browsing_delegate=True,
codeact_enable_browsing=True,
codeact_enable_llm_editor=False,
)
config.set_agent_config(agent_config)

View File

@@ -0,0 +1,44 @@
from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
from openhands.events.action import AgentFinishAction, MessageAction
from openhands.events.event import Event
from openhands.events.observation import AgentDelegateObservation
from openhands.runtime.base import Runtime
class Test(BaseIntegrationTest):
INSTRUCTION = 'Look at https://github.com/All-Hands-AI/OpenHands/pull/8, and tell me what is happening there and what did @asadm suggest.'
@classmethod
def initialize_runtime(cls, runtime: Runtime) -> None:
pass
@classmethod
def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
# check if the "The answer is OpenHands is all you need!" is in any message
message_actions = [
event
for event in histories
if isinstance(
event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
)
]
for event in message_actions:
if isinstance(event, AgentDelegateObservation):
content = event.content
elif isinstance(event, AgentFinishAction):
content = event.outputs.get('content', '')
elif isinstance(event, MessageAction):
content = event.content
else:
raise ValueError(f'Unknown event type: {type(event)}')
if (
'non-commercial' in content
or 'MIT' in content
or 'Apache 2.0' in content
):
return TestResult(success=True)
return TestResult(
success=False,
reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
)

View File

@@ -10,10 +10,12 @@ import pandas as pd
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
codeact_user_response,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
@@ -29,7 +31,10 @@ from openhands.events.action import (
CmdRunAction,
MessageAction,
)
from openhands.events.observation import CmdOutputObservation
from openhands.events.observation import (
BrowserOutputObservation,
CmdOutputObservation,
)
from openhands.runtime.base import Runtime
from openhands.runtime.browser.browser_env import (
BROWSER_EVAL_GET_GOAL_ACTION,
@@ -37,7 +42,11 @@ from openhands.runtime.browser.browser_env import (
)
from openhands.utils.async_utils import call_async_from_sync
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
}
def get_config(
@@ -47,25 +56,32 @@ def get_config(
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime='eventstream',
runtime=os.environ.get('RUNTIME', 'eventstream'),
max_iterations=metadata.max_iterations,
sandbox=SandboxConfig(
base_container_image='xingyaoww/od-eval-miniwob:v1.0',
enable_auto_lint=True,
use_host_network=False,
browsergym_eval_env=env_id,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_remote_runtime_alive=False,
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
config.set_llm_config(
update_llm_config_for_completions_logging(
metadata.llm_config, metadata.eval_output_dir, env_id
)
)
return config
def initialize_runtime(
runtime: Runtime,
) -> str:
) -> tuple[str, BrowserOutputObservation]:
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
@@ -85,8 +101,14 @@ def initialize_runtime(
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
goal = obs.content
# Run noop to get the initial browser observation (e.g., the page URL & content)
action = BrowseInteractiveAction(browser_actions='noop(1000)')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
return goal
return goal, obs
def complete_runtime(
@@ -117,7 +139,7 @@ def process_instance(
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
env_id = instance.id
env_id = instance.instance_id
config = get_config(metadata, env_id)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
@@ -129,7 +151,12 @@ def process_instance(
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
task_str = initialize_runtime(runtime)
task_str, obs = initialize_runtime(runtime)
task_str += (
f'\nInitial browser state (output of `noop(1000)`):\n{obs.get_agent_obs_text()}'
)
state: State | None = asyncio.run(
run_controller(
config=config,
@@ -137,6 +164,9 @@ def process_instance(
content=task_str
), # take output from initialize_runtime
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class
],
)
)
@@ -159,7 +189,7 @@ def process_instance(
return_val = complete_runtime(runtime)
logger.info(f'Return value from complete_runtime: {return_val}')
reward = max(return_val['rewards'])
reward = max(return_val['rewards'], default=0)
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here

View File

@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
@@ -76,15 +77,13 @@ def get_config(
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(metadata.llm_config)
if metadata.llm_config.log_completions:
metadata.llm_config.log_completions_folder = os.path.join(
metadata.eval_output_dir, 'llm_completions', instance_id
)
logger.info(
f'Logging LLM completions for instance {instance_id} to '
f'{metadata.llm_config.log_completions_folder}'
config.set_llm_config(
update_llm_config_for_completions_logging(
metadata.llm_config,
metadata.eval_output_dir,
instance_id,
)
)
return config

View File

@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
@@ -40,6 +41,7 @@ from openhands.utils.async_utils import call_async_from_sync
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
@@ -88,6 +90,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
'5. Think about edgecases and make sure your fix handles them as well\n'
"Your thinking should be thorough and so it's fine if it's very long.\n"
)
if RUN_WITH_BROWSING:
instruction += (
'<IMPORTANT!>\n'
'You SHOULD NEVER attempt to browse the web. '
'</IMPORTANT!>\n'
)
return instruction
@@ -142,18 +151,14 @@ def get_config(
workspace_base=None,
workspace_mount_path=None,
)
if metadata.llm_config.log_completions:
metadata.llm_config.log_completions_folder = os.path.join(
metadata.eval_output_dir, 'llm_completions', instance['instance_id']
config.set_llm_config(
update_llm_config_for_completions_logging(
metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
)
logger.info(
f'Logging LLM completions for instance {instance["instance_id"]} to '
f'{metadata.llm_config.log_completions_folder}'
)
config.set_llm_config(metadata.llm_config)
)
agent_config = AgentConfig(
codeact_enable_jupyter=False,
codeact_enable_browsing_delegate=False,
codeact_enable_browsing=RUN_WITH_BROWSING,
codeact_enable_llm_editor=False,
)
config.set_agent_config(agent_config)

View File

@@ -34,6 +34,11 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then
USE_INSTANCE_IMAGE=true
fi
if [ -z "$RUN_WITH_BROWSING" ]; then
echo "RUN_WITH_BROWSING not specified, use default false"
RUN_WITH_BROWSING=false
fi
if [ -z "$DATASET" ]; then
echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
@@ -47,6 +52,8 @@ fi
export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
get_agent_version
@@ -67,6 +74,10 @@ if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"
fi
if [ "$RUN_WITH_BROWSING" = true ]; then
EVAL_NOTE="$EVAL_NOTE-with-browsing"
fi
if [ -n "$EXP_NAME" ]; then
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
fi

View File

@@ -411,3 +411,20 @@ def reset_logger_for_multiprocessing(
)
file_handler.setLevel(logging.INFO)
logger.addHandler(file_handler)
def update_llm_config_for_completions_logging(
llm_config: LLMConfig,
eval_output_dir: str,
instance_id: str,
) -> LLMConfig:
"""Update the LLM config for logging completions."""
if llm_config.log_completions:
llm_config.log_completions_folder = os.path.join(
eval_output_dir, 'llm_completions', instance_id
)
logger.info(
f'Logging LLM completions for instance {instance_id} to '
f'{llm_config.log_completions_folder}'
)
return llm_config

View File

@@ -16,6 +16,7 @@ from openhands.events.action import (
Action,
AgentDelegateAction,
AgentFinishAction,
BrowseInteractiveAction,
CmdRunAction,
FileEditAction,
IPythonRunCellAction,
@@ -23,6 +24,7 @@ from openhands.events.action import (
)
from openhands.events.observation import (
AgentDelegateObservation,
BrowserOutputObservation,
CmdOutputObservation,
FileEditObservation,
IPythonRunCellObservation,
@@ -42,7 +44,7 @@ from openhands.utils.prompt import PromptManager
class CodeActAgent(Agent):
VERSION = '2.1'
VERSION = '2.2'
"""
The Code Act Agent is a minimalist agent.
The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -105,7 +107,7 @@ class CodeActAgent(Agent):
if self.function_calling_active:
# Function calling mode
self.tools = codeact_function_calling.get_tools(
codeact_enable_browsing_delegate=self.config.codeact_enable_browsing_delegate,
codeact_enable_browsing=self.config.codeact_enable_browsing,
codeact_enable_jupyter=self.config.codeact_enable_jupyter,
codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
)
@@ -142,10 +144,10 @@ class CodeActAgent(Agent):
Args:
action (Action): The action to convert. Can be one of:
- AgentDelegateAction: For delegating tasks to other agents
- CmdRunAction: For executing bash commands
- IPythonRunCellAction: For running IPython code
- FileEditAction: For editing files
- BrowseInteractiveAction: For browsing the web
- AgentFinishAction: For ending the interaction
- MessageAction: For sending messages
pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
@@ -169,6 +171,7 @@ class CodeActAgent(Agent):
CmdRunAction,
IPythonRunCellAction,
FileEditAction,
BrowseInteractiveAction,
),
) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
if self.function_calling_active:
@@ -192,6 +195,10 @@ class CodeActAgent(Agent):
)
return []
else:
assert not isinstance(action, BrowseInteractiveAction), (
'BrowseInteractiveAction is not supported in non-function calling mode. Action: '
+ str(action)
)
content = [TextContent(text=self.action_parser.action_to_str(action))]
return [
Message(
@@ -266,6 +273,12 @@ class CodeActAgent(Agent):
elif isinstance(obs, FileEditObservation):
text = obs_prefix + truncate_content(str(obs), max_message_chars)
message = Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, BrowserOutputObservation):
text = obs.get_agent_obs_text()
message = Message(
role='user',
content=[TextContent(text=obs_prefix + text)],
)
elif isinstance(obs, AgentDelegateObservation):
text = obs_prefix + truncate_content(
obs.outputs['content'] if 'content' in obs.outputs else '',
@@ -335,6 +348,7 @@ class CodeActAgent(Agent):
}
if self.function_calling_active:
params['tools'] = self.tools
params['parallel_tool_calls'] = False
else:
params['stop'] = [
'</execute_ipython>',

View File

@@ -5,6 +5,7 @@ This is similar to the functionality of `CodeActResponseParser`.
import json
from browsergym.core.action.highlevel import HighLevelActionSet
from litellm import (
ChatCompletionToolParam,
ChatCompletionToolParamFunctionChunk,
@@ -16,6 +17,7 @@ from openhands.events.action import (
Action,
AgentDelegateAction,
AgentFinishAction,
BrowseInteractiveAction,
CmdRunAction,
FileEditAction,
IPythonRunCellAction,
@@ -272,24 +274,146 @@ StrReplaceEditorTool = ChatCompletionToolParam(
),
)
_BROWSER_DELEGATION = """Delegate the task to another browsing agent.
The assistant should delegate the task if it needs to browse the Internet.
# from browsergym/core/action/highlevel.py
_browser_action_space = HighLevelActionSet(
subsets=['bid', 'nav'],
strict=False, # less strict on the parsing of the actions
multiaction=True, # enable to agent to take multiple actions at once
)
_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
The following 15 functions are available. Nothing else is supported.
goto(url: str)
Description: Navigate to a url.
Examples:
goto('http://www.example.com')
go_back()
Description: Navigate to the previous page in history.
Examples:
go_back()
go_forward()
Description: Navigate to the next page in history.
Examples:
go_forward()
noop(wait_ms: float = 1000)
Description: Do nothing, and optionally wait for the given time (in milliseconds).
You can use this to get the current page content and/or wait for the page to load.
Examples:
noop()
noop(500)
scroll(delta_x: float, delta_y: float)
Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
Examples:
scroll(0, 200)
scroll(-50.2, -100.5)
fill(bid: str, value: str)
Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
Examples:
fill('237', 'example value')
fill('45', 'multi-line\nexample')
fill('a12', 'example with "quotes"')
select_option(bid: str, options: str | list[str])
Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
Examples:
select_option('a48', 'blue')
select_option('c48', ['red', 'green', 'blue'])
click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
Description: Click an element.
Examples:
click('a51')
click('b22', button='right')
click('48', button='middle', modifiers=['Shift'])
dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
Description: Double click an element.
Examples:
dblclick('12')
dblclick('ca42', button='right')
dblclick('178', button='middle', modifiers=['Shift'])
hover(bid: str)
Description: Hover over an element.
Examples:
hover('b8')
press(bid: str, key_comb: str)
Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
Examples:
press('88', 'Backspace')
press('a26', 'ControlOrMeta+a')
press('a61', 'Meta+Shift+t')
focus(bid: str)
Description: Focus the matching element.
Examples:
focus('b455')
clear(bid: str)
Description: Clear the input field.
Examples:
clear('996')
drag_and_drop(from_bid: str, to_bid: str)
Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
Examples:
drag_and_drop('56', '498')
upload_file(bid: str, file: str | list[str])
Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
Examples:
upload_file('572', '/home/user/my_receipt.pdf')
upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
More than 2-3 actions usually leads to failure or unexpected behavior. Example:
fill('a12', 'example with "quotes"')
click('a51')
click('48', button='middle', modifiers=['Shift'])
"""
BrowserDelegationTool = ChatCompletionToolParam(
for _, action in _browser_action_space.action_set.items():
assert (
action.signature in _BROWSER_DESCRIPTION
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
assert (
action.description in _BROWSER_DESCRIPTION
), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
BrowserTool = ChatCompletionToolParam(
type='function',
function=ChatCompletionToolParamFunctionChunk(
name='delegate_to_browsing_agent',
description=_BROWSER_DELEGATION,
name='browser',
description=_BROWSER_DESCRIPTION,
parameters={
'type': 'object',
'properties': {
'task': {
'code': {
'type': 'string',
'description': 'The task for the browsing agent to execute. It should include all the necessary context and specify what information the browsing agent should return.',
},
'description': 'The Python code that interacts with the browser.',
}
},
'required': ['task'],
'required': ['code'],
},
),
)
@@ -357,6 +481,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
)
action = IPythonRunCellAction(code=code, include_extra=False)
elif tool_call.function.name == 'browser':
action = BrowseInteractiveAction(browser_actions=arguments['code'])
else:
raise RuntimeError(f'Unknown tool call: {tool_call.function.name}')
@@ -381,13 +507,13 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
def get_tools(
codeact_enable_browsing_delegate: bool = False,
codeact_enable_browsing: bool = False,
codeact_enable_llm_editor: bool = False,
codeact_enable_jupyter: bool = False,
) -> list[ChatCompletionToolParam]:
tools = [CmdRunTool, FinishTool]
if codeact_enable_browsing_delegate:
tools.append(BrowserDelegationTool)
if codeact_enable_browsing:
tools.append(BrowserTool)
if codeact_enable_jupyter:
tools.append(IPythonTool)
if codeact_enable_llm_editor:

View File

@@ -9,7 +9,7 @@ class AgentConfig:
Attributes:
function_calling: Whether function calling is enabled. Default is True.
codeact_enable_browsing_delegate: Whether browsing delegate is enabled in the action space. Default is False. Only works with function calling.
codeact_enable_browsing: Whether browsing delegate is enabled in the action space. Default is False. Only works with function calling.
codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling.
codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is False.
micro_agent_name: The name of the micro agent to use for this agent.
@@ -19,7 +19,7 @@ class AgentConfig:
"""
function_calling: bool = True
codeact_enable_browsing_delegate: bool = True
codeact_enable_browsing: bool = True
codeact_enable_llm_editor: bool = False
codeact_enable_jupyter: bool = True
micro_agent_name: str | None = None

View File

@@ -1,5 +1,7 @@
from dataclasses import dataclass, field
from browsergym.utils.obs import flatten_axtree_to_str
from openhands.core.schema import ObservationType
from openhands.events.observation.observation import Observation
@@ -29,7 +31,7 @@ class BrowserOutputObservation(Observation):
return 'Visited ' + self.url
def __str__(self) -> str:
return (
ret = (
'**BrowserOutputObservation**\n'
f'URL: {self.url}\n'
f'Error: {self.error}\n'
@@ -38,5 +40,47 @@ class BrowserOutputObservation(Observation):
f'Last browser action: {self.last_browser_action}\n'
f'Last browser action error: {self.last_browser_action_error}\n'
f'Focused element bid: {self.focused_element_bid}\n'
f'CONTENT: {self.content}\n'
f'Content: {self.content}\n'
)
ret += '--- Agent Observation ---\n'
ret += self.get_agent_obs_text()
return ret
def get_agent_obs_text(self) -> str:
"""Get a concise text that will be shown to the agent."""
text = f'[Current URL: {self.url}]\n'
text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'
'The following error occurred when executing the last action:\n'
f'{self.last_browser_action_error}\n'
'================ END error message ===============\n'
)
else:
text += '[Action executed successfully.]\n'
try:
# We do not filter visible only here because we want to show the full content
# of the web page to the agent for simplicity.
# FIXME: handle the case when the web page is too large
cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
text += (
f'============== BEGIN accessibility tree ==============\n'
f'{cur_axtree_txt}\n'
f'============== END accessibility tree ==============\n'
)
except Exception as e:
text += f'\n[Error encountered when processing the accessibility tree: {e}]'
return text
def get_axtree_str(self, filter_visible_only: bool = False) -> str:
cur_axtree_txt = flatten_axtree_to_str(
self.axtree_object,
extra_properties=self.extra_element_properties,
with_clickable=True,
skip_generic=False,
filter_visible_only=filter_visible_only,
)
self._axtree_str = cur_axtree_txt
return cur_axtree_txt

View File

@@ -81,7 +81,10 @@ class BrowserEnv:
raise ValueError(
f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
)
env = gym.make(self.browsergym_eval_env)
env = gym.make(
self.browsergym_eval_env,
tags_to_mark='all',
)
else:
env = gym.make(
'browsergym/openended',
@@ -89,6 +92,7 @@ class BrowserEnv:
wait_for_user_message=False,
headless=True,
disable_env_checker=True,
tags_to_mark='all',
)
obs, info = env.reset()