[OH-Versa] Add remaining browsing & GAIA eval improvement (#9015)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
This commit is contained in:
Ryan H. Tran
2025-06-25 12:36:15 +07:00
committed by GitHub
parent 76914e3c26
commit dfa54673d2
16 changed files with 383 additions and 29 deletions

View File

@@ -94,6 +94,9 @@ class BrowserEnv:
headless=True,
disable_env_checker=True,
tags_to_mark='all',
+timeout=100000,
+pw_context_kwargs={'accept_downloads': True},
+pw_chromium_kwargs={'downloads_path': '/workspace/.downloads/'},
)
obs, info = env.reset()
@@ -105,6 +108,7 @@ class BrowserEnv:
if self.eval_mode:
self.eval_goal = obs['goal']
if 'goal_object' in obs:
+obs['goal_object'] = list(obs['goal_object'])
if len(obs['goal_object']) > 0:
self.eval_goal = obs['goal_object'][0]['text']
for message in obs['goal_object']:
@@ -182,7 +186,7 @@ class BrowserEnv:
pass
return
-def step(self, action_str: str, timeout: float = 100) -> dict:
+def step(self, action_str: str, timeout: float = 120) -> dict:
"""Execute an action in the browser environment and return the observation."""
unique_request_id = str(uuid.uuid4())
self.agent_side.send((unique_request_id, {'action': action_str}))

View File

@@ -59,13 +59,22 @@ def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
cur_axtree_txt = get_axtree_str(
obs.axtree_object,
obs.extra_element_properties,
-filter_visible_only=False,
-)
-text += (
-f'============== BEGIN accessibility tree ==============\n'
-f'{cur_axtree_txt}\n'
-f'============== END accessibility tree ==============\n'
+filter_visible_only=obs.filter_visible_only,
+)
+if not obs.filter_visible_only:
+text += (
+f'Accessibility tree of the COMPLETE webpage:\nNote: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n'
+f'============== BEGIN accessibility tree ==============\n'
+f'{cur_axtree_txt}\n'
+f'============== END accessibility tree ==============\n'
+)
+else:
+text += (
+f'Accessibility tree of the VISIBLE portion of the webpage (accessibility tree of complete webpage is too large and you may need to scroll to view remaining portion of the webpage):\nNote: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n'
+f'============== BEGIN accessibility tree ==============\n'
+f'{cur_axtree_txt}\n'
+f'============== END accessibility tree ==============\n'
)
except Exception as e:
text += f'\n[Error encountered when processing the accessibility tree: {e}]'
return text