mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-09 14:57:59 -05:00
[Eval, Browser] Refactor Browser Env so it works with EventStreamRuntime for Browsing Evaluation (#3235)
* refactor browser env so it works with eventstream runtime for eval
* fix browsergym environment
This commit is contained in:
@@ -154,6 +154,9 @@ class SandboxConfig(metaclass=Singleton):
|
||||
initialize_plugins: Whether to initialize plugins.
|
||||
update_source_code: Whether to update the source code in the EventStreamRuntime.
|
||||
Used for development of EventStreamRuntime.
|
||||
browsergym_eval_env: The BrowserGym environment to use for evaluation.
|
||||
Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
|
||||
|
||||
"""
|
||||
|
||||
box_type: str = 'ssh'
|
||||
@@ -170,6 +173,7 @@ class SandboxConfig(metaclass=Singleton):
|
||||
use_host_network: bool = False
|
||||
initialize_plugins: bool = True
|
||||
update_source_code: bool = False
|
||||
browsergym_eval_env: str | None = None
|
||||
|
||||
def defaults_to_dict(self) -> dict:
|
||||
"""Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
|
||||
|
||||
@@ -11,7 +11,6 @@ class BrowserOutputObservation(Observation):
|
||||
|
||||
url: str
|
||||
screenshot: str = field(repr=False) # don't show in repr
|
||||
status_code: int = 200
|
||||
error: bool = False
|
||||
observation: str = ObservationType.BROWSE
|
||||
# do not include in the memory
|
||||
@@ -34,12 +33,12 @@ class BrowserOutputObservation(Observation):
|
||||
return (
|
||||
'**BrowserOutputObservation**\n'
|
||||
f'URL: {self.url}\n'
|
||||
f'Status code: {self.status_code}\n'
|
||||
f'Error: {self.error}\n'
|
||||
f'Open pages: {self.open_pages_urls}\n'
|
||||
f'Active page index: {self.active_page_index}\n'
|
||||
f'Last browser action: {self.last_browser_action}\n'
|
||||
f'Last browser action error: {self.last_browser_action_error}\n'
|
||||
f'Focused element bid: {self.focused_element_bid}\n'
|
||||
f'axTree: {self.axtree_object}\n'
|
||||
f'CONTENT: {self.content}\n'
|
||||
)
|
||||
|
||||
@@ -3,7 +3,6 @@ import base64
|
||||
import io
|
||||
import json
|
||||
import multiprocessing
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
|
||||
@@ -18,41 +17,24 @@ from PIL import Image
|
||||
from opendevin.core.exceptions import BrowserInitException
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
|
||||
BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
|
||||
BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
|
||||
|
||||
|
||||
class BrowserEnv:
|
||||
def __init__(
|
||||
self,
|
||||
browsergym_eval: str = '',
|
||||
browsergym_eval_save_dir: str = '',
|
||||
):
|
||||
def __init__(self, browsergym_eval_env: str | None = None):
|
||||
self.html_text_converter = self.get_html_text_converter()
|
||||
self.eval_mode = False
|
||||
self.eval_dir = ''
|
||||
# EVAL only: browsergym_eval and browsergym_eval_save_dir must be provided for evaluation
|
||||
self.browsergym_eval = browsergym_eval
|
||||
self.browsergym_eval_save_dir = browsergym_eval_save_dir
|
||||
if self.browsergym_eval:
|
||||
assert (
|
||||
self.browsergym_eval_save_dir
|
||||
), 'browsergym_eval_save_dir must be provided for evaluation.'
|
||||
self.eval_mode = True
|
||||
self.eval_dir = os.path.join(
|
||||
self.browsergym_eval_save_dir, self.browsergym_eval.split('/')[1]
|
||||
)
|
||||
os.makedirs(self.eval_dir, exist_ok=True)
|
||||
|
||||
# EVAL only: browsergym_eval_env must be provided for evaluation
|
||||
self.browsergym_eval_env = browsergym_eval_env
|
||||
self.eval_mode = bool(browsergym_eval_env)
|
||||
|
||||
# Initialize browser environment process
|
||||
multiprocessing.set_start_method('spawn', force=True)
|
||||
self.browser_side, self.agent_side = multiprocessing.Pipe()
|
||||
|
||||
try:
|
||||
self.original_cwd = os.getcwd()
|
||||
except FileNotFoundError:
|
||||
logger.warning(
|
||||
'Current working directory does not exist. Using /tmp as fallback.'
|
||||
)
|
||||
self.original_cwd = '/tmp'
|
||||
os.chdir('/tmp')
|
||||
|
||||
self.init_browser()
|
||||
atexit.register(self.close)
|
||||
|
||||
@@ -74,17 +56,6 @@ class BrowserEnv:
|
||||
)
|
||||
def init_browser(self):
|
||||
logger.info('Starting browser env...')
|
||||
|
||||
# Ensure we're in a valid directory before starting the process
|
||||
try:
|
||||
os.chdir(self.original_cwd)
|
||||
logger.debug(f'Changed back to original directory: {self.original_cwd}')
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to change to original directory: {e}')
|
||||
# If we can't change to the original directory, try to use a known valid directory
|
||||
os.chdir('/tmp')
|
||||
logger.debug('Changed to /tmp directory as fallback')
|
||||
|
||||
try:
|
||||
self.process = multiprocessing.Process(target=self.browser_process)
|
||||
self.process.start()
|
||||
@@ -98,8 +69,17 @@ class BrowserEnv:
|
||||
|
||||
def browser_process(self):
|
||||
if self.eval_mode:
|
||||
logger.info('Creating browser env for evaluation purpose.')
|
||||
env = gym.make(self.browsergym_eval)
|
||||
assert self.browsergym_eval_env is not None
|
||||
logger.info('Initializing browser env for web browsing evaluation.')
|
||||
if 'webarena' in self.browsergym_eval_env:
|
||||
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
|
||||
elif 'miniwob' in self.browsergym_eval_env:
|
||||
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
|
||||
else:
|
||||
raise ValueError(
|
||||
f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
|
||||
)
|
||||
env = gym.make(self.browsergym_eval_env)
|
||||
else:
|
||||
env = gym.make(
|
||||
'browsergym/openended',
|
||||
@@ -108,20 +88,22 @@ class BrowserEnv:
|
||||
headless=True,
|
||||
disable_env_checker=True,
|
||||
)
|
||||
|
||||
obs, info = env.reset()
|
||||
# EVAL only: save the goal into file for evaluation
|
||||
|
||||
# EVAL ONLY: save the goal into file for evaluation
|
||||
self.eval_goal = None
|
||||
self.eval_rewards: list[float] = []
|
||||
if self.eval_mode:
|
||||
rewards = [] # store rewards if in eval mode
|
||||
logger.info(obs['goal'])
|
||||
with open(
|
||||
os.path.join(self.eval_dir, 'goal.txt'), 'w', encoding='utf-8'
|
||||
) as f:
|
||||
f.write(obs['goal'])
|
||||
logger.info(f"Browsing goal: {obs['goal']}")
|
||||
self.eval_goal = obs['goal']
|
||||
|
||||
logger.info('Browser env started.')
|
||||
while True:
|
||||
try:
|
||||
if self.browser_side.poll(timeout=0.01):
|
||||
unique_request_id, action_data = self.browser_side.recv()
|
||||
|
||||
# shutdown the browser environment
|
||||
if unique_request_id == 'SHUTDOWN':
|
||||
logger.info('SHUTDOWN recv, shutting down browser env...')
|
||||
@@ -130,17 +112,29 @@ class BrowserEnv:
|
||||
elif unique_request_id == 'IS_ALIVE':
|
||||
self.browser_side.send(('ALIVE', None))
|
||||
continue
|
||||
|
||||
# EVAL ONLY: Get evaluation info
|
||||
if action_data['action'] == BROWSER_EVAL_GET_GOAL_ACTION:
|
||||
self.browser_side.send(
|
||||
(unique_request_id, {'text_content': self.eval_goal})
|
||||
)
|
||||
continue
|
||||
elif action_data['action'] == BROWSER_EVAL_GET_REWARDS_ACTION:
|
||||
self.browser_side.send(
|
||||
(
|
||||
unique_request_id,
|
||||
{'text_content': json.dumps(self.eval_rewards)},
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
action = action_data['action']
|
||||
obs, reward, terminated, truncated, info = env.step(action)
|
||||
# EVAL only: save the rewards into file for evaluation
|
||||
|
||||
# EVAL ONLY: Save the rewards into file for evaluation
|
||||
if self.eval_mode:
|
||||
rewards.append(reward)
|
||||
with open(
|
||||
os.path.join(self.eval_dir, 'rewards.json'),
|
||||
'w',
|
||||
encoding='utf-8',
|
||||
) as f:
|
||||
f.write(json.dumps(rewards))
|
||||
self.eval_rewards.append(reward)
|
||||
|
||||
# add text content of the page
|
||||
html_str = flatten_dom_to_str(obs['dom_object'])
|
||||
obs['text_content'] = self.html_text_converter.handle(html_str)
|
||||
@@ -158,6 +152,7 @@ class BrowserEnv:
|
||||
return
|
||||
|
||||
def step(self, action_str: str, timeout: float = 30) -> dict:
|
||||
"""Execute an action in the browser environment and return the observation."""
|
||||
unique_request_id = str(uuid.uuid4())
|
||||
self.agent_side.send((unique_request_id, {'action': action_str}))
|
||||
start_time = time.time()
|
||||
|
||||
@@ -32,21 +32,23 @@ async def browse(
|
||||
obs = browser.step(action_str)
|
||||
return BrowserOutputObservation(
|
||||
content=obs['text_content'], # text content of the page
|
||||
open_pages_urls=obs['open_pages_urls'], # list of open pages
|
||||
active_page_index=obs['active_page_index'], # index of the active page
|
||||
dom_object=obs['dom_object'], # DOM object
|
||||
axtree_object=obs['axtree_object'], # accessibility tree object
|
||||
extra_element_properties=obs[
|
||||
'extra_element_properties'
|
||||
], # extra element properties
|
||||
last_browser_action=obs['last_action'], # last browser env action performed
|
||||
focused_element_bid=obs['focused_element_bid'], # focused element bid
|
||||
screenshot=obs['screenshot'], # base64-encoded screenshot, png
|
||||
url=obs['url'], # URL of the page
|
||||
error=True if obs['last_action_error'] else False, # error flag
|
||||
last_browser_action_error=obs[
|
||||
'last_action_error'
|
||||
], # last browser env action error
|
||||
url=obs.get('url', ''), # URL of the page
|
||||
screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png
|
||||
open_pages_urls=obs.get('open_pages_urls', []), # list of open pages
|
||||
active_page_index=obs.get(
|
||||
'active_page_index', -1
|
||||
), # index of the active page
|
||||
dom_object=obs.get('dom_object', {}), # DOM object
|
||||
axtree_object=obs.get('axtree_object', {}), # accessibility tree object
|
||||
extra_element_properties=obs.get('extra_element_properties', {}),
|
||||
focused_element_bid=obs.get(
|
||||
'focused_element_bid', None
|
||||
), # focused element bid
|
||||
last_browser_action=obs.get(
|
||||
'last_action', ''
|
||||
), # last browser env action performed
|
||||
last_browser_action_error=obs.get('last_action_error', ''),
|
||||
error=True if obs.get('last_action_error', '') else False, # error flag
|
||||
)
|
||||
except Exception as e:
|
||||
return BrowserOutputObservation(
|
||||
|
||||
@@ -64,7 +64,12 @@ class RuntimeClient:
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, plugins_to_load: list[Plugin], work_dir: str, username: str, user_id: int
|
||||
self,
|
||||
plugins_to_load: list[Plugin],
|
||||
work_dir: str,
|
||||
username: str,
|
||||
user_id: int,
|
||||
browsergym_eval_env: str | None,
|
||||
) -> None:
|
||||
self.plugins_to_load = plugins_to_load
|
||||
self.username = username
|
||||
@@ -74,7 +79,7 @@ class RuntimeClient:
|
||||
self._init_bash_shell(self.pwd, self.username)
|
||||
self.lock = asyncio.Lock()
|
||||
self.plugins: dict[str, Plugin] = {}
|
||||
self.browser = BrowserEnv()
|
||||
self.browser = BrowserEnv(browsergym_eval_env)
|
||||
|
||||
async def ainit(self):
|
||||
for plugin in self.plugins_to_load:
|
||||
@@ -362,6 +367,12 @@ if __name__ == '__main__':
|
||||
'--username', type=str, help='User to run as', default='opendevin'
|
||||
)
|
||||
parser.add_argument('--user-id', type=int, help='User ID to run as', default=1000)
|
||||
parser.add_argument(
|
||||
'--browsergym-eval-env',
|
||||
type=str,
|
||||
help='BrowserGym environment used for browser evaluation',
|
||||
default=None,
|
||||
)
|
||||
# example: python client.py 8000 --working-dir /workspace --plugins JupyterRequirement
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -382,6 +393,7 @@ if __name__ == '__main__':
|
||||
work_dir=args.working_dir,
|
||||
username=args.username,
|
||||
user_id=args.user_id,
|
||||
browsergym_eval_env=args.browsergym_eval_env,
|
||||
)
|
||||
await client.ainit()
|
||||
yield
|
||||
|
||||
@@ -147,6 +147,12 @@ class EventStreamRuntime(Runtime):
|
||||
|
||||
logger.info(f'run_as_devin: `{self.config.run_as_devin}`')
|
||||
|
||||
if self.config.sandbox.browsergym_eval_env is not None:
|
||||
browsergym_arg = (
|
||||
f'--browsergym-eval-env {self.config.sandbox.browsergym_eval_env}'
|
||||
)
|
||||
else:
|
||||
browsergym_arg = ''
|
||||
container = self.docker_client.containers.run(
|
||||
self.container_image,
|
||||
command=(
|
||||
@@ -156,7 +162,8 @@ class EventStreamRuntime(Runtime):
|
||||
f'--working-dir {sandbox_workspace_dir} '
|
||||
f'{plugin_arg}'
|
||||
f'--username {"opendevin" if self.config.run_as_devin else "root"} '
|
||||
f'--user-id {self.config.sandbox.user_id}'
|
||||
f'--user-id {self.config.sandbox.user_id} '
|
||||
f'{browsergym_arg}'
|
||||
),
|
||||
network_mode=network_mode,
|
||||
ports=port_mapping,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""Test the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
@@ -13,6 +14,7 @@ from opendevin.core.config import AppConfig, SandboxConfig, load_from_env
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.events import EventStream
|
||||
from opendevin.events.action import (
|
||||
BrowseInteractiveAction,
|
||||
BrowseURLAction,
|
||||
CmdRunAction,
|
||||
FileReadAction,
|
||||
@@ -29,6 +31,7 @@ from opendevin.events.observation import (
|
||||
)
|
||||
from opendevin.runtime.client.runtime import EventStreamRuntime
|
||||
from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
|
||||
from opendevin.runtime.runtime import Runtime
|
||||
from opendevin.runtime.server.runtime import ServerRuntime
|
||||
from opendevin.storage import get_file_store
|
||||
|
||||
@@ -95,7 +98,8 @@ async def _load_runtime(
|
||||
run_as_devin: bool = True,
|
||||
enable_auto_lint: bool = False,
|
||||
container_image: str | None = None,
|
||||
):
|
||||
browsergym_eval_env: str | None = None,
|
||||
) -> Runtime:
|
||||
sid = 'test'
|
||||
cli_session = 'main_test'
|
||||
# AgentSkills need to be initialized **before** Jupyter
|
||||
@@ -104,7 +108,10 @@ async def _load_runtime(
|
||||
config = AppConfig(
|
||||
workspace_base=temp_dir,
|
||||
workspace_mount_path=temp_dir,
|
||||
sandbox=SandboxConfig(use_host_network=True),
|
||||
sandbox=SandboxConfig(
|
||||
use_host_network=True,
|
||||
browsergym_eval_env=browsergym_eval_env,
|
||||
),
|
||||
)
|
||||
load_from_env(config, os.environ)
|
||||
config.run_as_devin = run_as_devin
|
||||
@@ -120,7 +127,9 @@ async def _load_runtime(
|
||||
# NOTE: we will use the default container image specified in the config.sandbox
|
||||
# if it is an official od_runtime image.
|
||||
cur_container_image = config.sandbox.container_image
|
||||
if 'od_runtime' not in cur_container_image:
|
||||
if 'od_runtime' not in cur_container_image and cur_container_image not in {
|
||||
'xingyaoww/od-eval-miniwob:v1.0'
|
||||
}: # a special exception list
|
||||
cur_container_image = 'ubuntu:22.04'
|
||||
logger.warning(
|
||||
f'`{config.sandbox.container_image}` is not an od_runtime image. Will use `{cur_container_image}` as the container image for testing.'
|
||||
@@ -387,7 +396,6 @@ async def test_simple_browse(temp_dir, box_class, run_as_devin):
|
||||
|
||||
assert isinstance(obs, BrowserOutputObservation)
|
||||
assert 'http://localhost:8000' in obs.url
|
||||
assert obs.status_code == 200
|
||||
assert not obs.error
|
||||
assert obs.open_pages_urls == ['http://localhost:8000/']
|
||||
assert obs.active_page_index == 0
|
||||
@@ -407,6 +415,53 @@ async def test_simple_browse(temp_dir, box_class, run_as_devin):
|
||||
await asyncio.sleep(1)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_browsergym_eval_env(temp_dir):
|
||||
runtime = await _load_runtime(
|
||||
temp_dir,
|
||||
# only supported in event stream runtime
|
||||
box_class=EventStreamRuntime,
|
||||
run_as_devin=False, # need root permission to access file
|
||||
container_image='xingyaoww/od-eval-miniwob:v1.0',
|
||||
browsergym_eval_env='browsergym/miniwob.choose-list',
|
||||
)
|
||||
from opendevin.runtime.browser.browser_env import (
|
||||
BROWSER_EVAL_GET_GOAL_ACTION,
|
||||
BROWSER_EVAL_GET_REWARDS_ACTION,
|
||||
)
|
||||
|
||||
# Test browse
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = await runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
assert isinstance(obs, BrowserOutputObservation)
|
||||
assert not obs.error
|
||||
assert 'Select' in obs.content
|
||||
assert 'from the list and click Submit' in obs.content
|
||||
|
||||
# Make sure the browser can produce observation in eval
|
||||
action = BrowseInteractiveAction(browser_actions='noop()')
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = await runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert (
|
||||
obs.url.strip()
|
||||
== 'file:///miniwob-plusplus/miniwob/html/miniwob/choose-list.html'
|
||||
)
|
||||
|
||||
# Make sure the rewards are working
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = await runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert json.loads(obs.content) == [0.0]
|
||||
|
||||
await runtime.close()
|
||||
await asyncio.sleep(1)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_multiline_command(temp_dir, box_class):
|
||||
runtime = await _load_runtime(temp_dir, box_class)
|
||||
|
||||
Reference in New Issue
Block a user