From b7061f4497bbdc8da8afa3936c10ad867545f5d4 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 3 Aug 2024 23:06:37 +0800 Subject: [PATCH] [Eval, Browser] Refactor Browser Env so it works with `EventStreamRuntime` for Browsing Evaluation (#3235) * refactor browser env so it works with eventstream runtime for eval * fix browsergym environment --- opendevin/core/config.py | 4 + opendevin/events/observation/browse.py | 3 +- opendevin/runtime/browser/browser_env.py | 105 +++++++++++------------ opendevin/runtime/browser/utils.py | 32 +++---- opendevin/runtime/client/client.py | 16 +++- opendevin/runtime/client/runtime.py | 9 +- tests/unit/test_runtime.py | 63 +++++++++++++- 7 files changed, 153 insertions(+), 79 deletions(-) diff --git a/opendevin/core/config.py b/opendevin/core/config.py index 17d43a1686..8adaf59a87 100644 --- a/opendevin/core/config.py +++ b/opendevin/core/config.py @@ -154,6 +154,9 @@ class SandboxConfig(metaclass=Singleton): initialize_plugins: Whether to initialize plugins. update_source_code: Whether to update the source code in the EventStreamRuntime. Used for development of EventStreamRuntime. + browsergym_eval_env: The BrowserGym environment to use for evaluation. + Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples. 
+ """ box_type: str = 'ssh' @@ -170,6 +173,7 @@ class SandboxConfig(metaclass=Singleton): use_host_network: bool = False initialize_plugins: bool = True update_source_code: bool = False + browsergym_eval_env: str | None = None def defaults_to_dict(self) -> dict: """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" diff --git a/opendevin/events/observation/browse.py b/opendevin/events/observation/browse.py index eaf44ac101..901b9ffb75 100644 --- a/opendevin/events/observation/browse.py +++ b/opendevin/events/observation/browse.py @@ -11,7 +11,6 @@ class BrowserOutputObservation(Observation): url: str screenshot: str = field(repr=False) # don't show in repr - status_code: int = 200 error: bool = False observation: str = ObservationType.BROWSE # do not include in the memory @@ -34,12 +33,12 @@ class BrowserOutputObservation(Observation): return ( '**BrowserOutputObservation**\n' f'URL: {self.url}\n' - f'Status code: {self.status_code}\n' f'Error: {self.error}\n' f'Open pages: {self.open_pages_urls}\n' f'Active page index: {self.active_page_index}\n' f'Last browser action: {self.last_browser_action}\n' f'Last browser action error: {self.last_browser_action_error}\n' f'Focused element bid: {self.focused_element_bid}\n' + f'axTree: {self.axtree_object}\n' f'CONTENT: {self.content}\n' ) diff --git a/opendevin/runtime/browser/browser_env.py b/opendevin/runtime/browser/browser_env.py index 35eb54a11e..7d060580bd 100644 --- a/opendevin/runtime/browser/browser_env.py +++ b/opendevin/runtime/browser/browser_env.py @@ -3,7 +3,6 @@ import base64 import io import json import multiprocessing -import os import time import uuid @@ -18,41 +17,24 @@ from PIL import Image from opendevin.core.exceptions import BrowserInitException from opendevin.core.logger import opendevin_logger as logger +BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL' +BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS' + class BrowserEnv: - def __init__( - 
self, - browsergym_eval: str = '', - browsergym_eval_save_dir: str = '', - ): + def __init__(self, browsergym_eval_env: str | None = None): self.html_text_converter = self.get_html_text_converter() self.eval_mode = False self.eval_dir = '' - # EVAL only: browsergym_eval and browsergym_eval_save_dir must be provided for evaluation - self.browsergym_eval = browsergym_eval - self.browsergym_eval_save_dir = browsergym_eval_save_dir - if self.browsergym_eval: - assert ( - self.browsergym_eval_save_dir - ), 'browsergym_eval_save_dir must be provided for evaluation.' - self.eval_mode = True - self.eval_dir = os.path.join( - self.browsergym_eval_save_dir, self.browsergym_eval.split('/')[1] - ) - os.makedirs(self.eval_dir, exist_ok=True) + + # EVAL only: browsergym_eval_env must be provided for evaluation + self.browsergym_eval_env = browsergym_eval_env + self.eval_mode = bool(browsergym_eval_env) + # Initialize browser environment process multiprocessing.set_start_method('spawn', force=True) self.browser_side, self.agent_side = multiprocessing.Pipe() - try: - self.original_cwd = os.getcwd() - except FileNotFoundError: - logger.warning( - 'Current working directory does not exist. Using /tmp as fallback.' 
- ) - self.original_cwd = '/tmp' - os.chdir('/tmp') - self.init_browser() atexit.register(self.close) @@ -74,17 +56,6 @@ class BrowserEnv: ) def init_browser(self): logger.info('Starting browser env...') - - # Ensure we're in a valid directory before starting the process - try: - os.chdir(self.original_cwd) - logger.debug(f'Changed back to original directory: {self.original_cwd}') - except Exception as e: - logger.error(f'Failed to change to original directory: {e}') - # If we can't change to the original directory, try to use a known valid directory - os.chdir('/tmp') - logger.debug('Changed to /tmp directory as fallback') - try: self.process = multiprocessing.Process(target=self.browser_process) self.process.start() @@ -98,8 +69,17 @@ class BrowserEnv: def browser_process(self): if self.eval_mode: - logger.info('Creating browser env for evaluation purpose.') - env = gym.make(self.browsergym_eval) + assert self.browsergym_eval_env is not None + logger.info('Initializing browser env for web browsing evaluation.') + if 'webarena' in self.browsergym_eval_env: + import browsergym.webarena # noqa F401 register webarena tasks as gym environments + elif 'miniwob' in self.browsergym_eval_env: + import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments + else: + raise ValueError( + f'Unsupported browsergym eval env: {self.browsergym_eval_env}' + ) + env = gym.make(self.browsergym_eval_env) else: env = gym.make( 'browsergym/openended', @@ -108,20 +88,22 @@ class BrowserEnv: headless=True, disable_env_checker=True, ) + obs, info = env.reset() - # EVAL only: save the goal into file for evaluation + + # EVAL ONLY: keep the goal in memory so it can be queried for evaluation + self.eval_goal = None + self.eval_rewards: list[float] = [] if self.eval_mode: - rewards = [] # store rewards if in eval mode - logger.info(obs['goal']) - with open( - os.path.join(self.eval_dir, 'goal.txt'), 'w', encoding='utf-8' - ) as f: - f.write(obs['goal']) + logger.info(f"Browsing goal: 
{obs['goal']}") + self.eval_goal = obs['goal'] + logger.info('Browser env started.') while True: try: if self.browser_side.poll(timeout=0.01): unique_request_id, action_data = self.browser_side.recv() + # shutdown the browser environment if unique_request_id == 'SHUTDOWN': logger.info('SHUTDOWN recv, shutting down browser env...') @@ -130,17 +112,29 @@ class BrowserEnv: elif unique_request_id == 'IS_ALIVE': self.browser_side.send(('ALIVE', None)) continue + + # EVAL ONLY: Get evaluation info + if action_data['action'] == BROWSER_EVAL_GET_GOAL_ACTION: + self.browser_side.send( + (unique_request_id, {'text_content': self.eval_goal}) + ) + continue + elif action_data['action'] == BROWSER_EVAL_GET_REWARDS_ACTION: + self.browser_side.send( + ( + unique_request_id, + {'text_content': json.dumps(self.eval_rewards)}, + ) + ) + continue + action = action_data['action'] obs, reward, terminated, truncated, info = env.step(action) - # EVAL only: save the rewards into file for evaluation + + # EVAL ONLY: Record the reward in memory so it can be queried for evaluation if self.eval_mode: - rewards.append(reward) - with open( - os.path.join(self.eval_dir, 'rewards.json'), - 'w', - encoding='utf-8', - ) as f: - f.write(json.dumps(rewards)) + self.eval_rewards.append(reward) + # add text content of the page html_str = flatten_dom_to_str(obs['dom_object']) obs['text_content'] = self.html_text_converter.handle(html_str) @@ -158,6 +152,7 @@ class BrowserEnv: return def step(self, action_str: str, timeout: float = 30) -> dict: + """Execute an action in the browser environment and return the observation.""" unique_request_id = str(uuid.uuid4()) self.agent_side.send((unique_request_id, {'action': action_str})) start_time = time.time() diff --git a/opendevin/runtime/browser/utils.py b/opendevin/runtime/browser/utils.py index 8ca73eab0c..b456874da0 100644 --- a/opendevin/runtime/browser/utils.py +++ b/opendevin/runtime/browser/utils.py @@ -32,21 +32,23 @@ async def browse( obs = browser.step(action_str) return 
BrowserOutputObservation( content=obs['text_content'], # text content of the page - open_pages_urls=obs['open_pages_urls'], # list of open pages - active_page_index=obs['active_page_index'], # index of the active page - dom_object=obs['dom_object'], # DOM object - axtree_object=obs['axtree_object'], # accessibility tree object - extra_element_properties=obs[ - 'extra_element_properties' - ], # extra element properties - last_browser_action=obs['last_action'], # last browser env action performed - focused_element_bid=obs['focused_element_bid'], # focused element bid - screenshot=obs['screenshot'], # base64-encoded screenshot, png - url=obs['url'], # URL of the page - error=True if obs['last_action_error'] else False, # error flag - last_browser_action_error=obs[ - 'last_action_error' - ], # last browser env action error + url=obs.get('url', ''), # URL of the page + screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png + open_pages_urls=obs.get('open_pages_urls', []), # list of open pages + active_page_index=obs.get( + 'active_page_index', -1 + ), # index of the active page + dom_object=obs.get('dom_object', {}), # DOM object + axtree_object=obs.get('axtree_object', {}), # accessibility tree object + extra_element_properties=obs.get('extra_element_properties', {}), + focused_element_bid=obs.get( + 'focused_element_bid', None + ), # focused element bid + last_browser_action=obs.get( + 'last_action', '' + ), # last browser env action performed + last_browser_action_error=obs.get('last_action_error', ''), + error=True if obs.get('last_action_error', '') else False, # error flag ) except Exception as e: return BrowserOutputObservation( diff --git a/opendevin/runtime/client/client.py b/opendevin/runtime/client/client.py index d1cbc794ca..89422be92e 100644 --- a/opendevin/runtime/client/client.py +++ b/opendevin/runtime/client/client.py @@ -64,7 +64,12 @@ class RuntimeClient: """ def __init__( - self, plugins_to_load: list[Plugin], work_dir: str, 
username: str, user_id: int + self, + plugins_to_load: list[Plugin], + work_dir: str, + username: str, + user_id: int, + browsergym_eval_env: str | None, ) -> None: self.plugins_to_load = plugins_to_load self.username = username @@ -74,7 +79,7 @@ class RuntimeClient: self._init_bash_shell(self.pwd, self.username) self.lock = asyncio.Lock() self.plugins: dict[str, Plugin] = {} - self.browser = BrowserEnv() + self.browser = BrowserEnv(browsergym_eval_env) async def ainit(self): for plugin in self.plugins_to_load: @@ -362,6 +367,12 @@ if __name__ == '__main__': '--username', type=str, help='User to run as', default='opendevin' ) parser.add_argument('--user-id', type=int, help='User ID to run as', default=1000) + parser.add_argument( + '--browsergym-eval-env', + type=str, + help='BrowserGym environment used for browser evaluation', + default=None, + ) # example: python client.py 8000 --working-dir /workspace --plugins JupyterRequirement args = parser.parse_args() @@ -382,6 +393,7 @@ if __name__ == '__main__': work_dir=args.working_dir, username=args.username, user_id=args.user_id, + browsergym_eval_env=args.browsergym_eval_env, ) await client.ainit() yield diff --git a/opendevin/runtime/client/runtime.py b/opendevin/runtime/client/runtime.py index 791e57250d..06a40f06f1 100644 --- a/opendevin/runtime/client/runtime.py +++ b/opendevin/runtime/client/runtime.py @@ -147,6 +147,12 @@ class EventStreamRuntime(Runtime): logger.info(f'run_as_devin: `{self.config.run_as_devin}`') + if self.config.sandbox.browsergym_eval_env is not None: + browsergym_arg = ( + f'--browsergym-eval-env {self.config.sandbox.browsergym_eval_env}' + ) + else: + browsergym_arg = '' container = self.docker_client.containers.run( self.container_image, command=( @@ -156,7 +162,8 @@ class EventStreamRuntime(Runtime): f'--working-dir {sandbox_workspace_dir} ' f'{plugin_arg}' f'--username {"opendevin" if self.config.run_as_devin else "root"} ' - f'--user-id {self.config.sandbox.user_id}' + f'--user-id 
{self.config.sandbox.user_id} ' + f'{browsergym_arg}' ), network_mode=network_mode, ports=port_mapping, diff --git a/tests/unit/test_runtime.py b/tests/unit/test_runtime.py index db4da35bdd..9b4d842464 100644 --- a/tests/unit/test_runtime.py +++ b/tests/unit/test_runtime.py @@ -1,6 +1,7 @@ """Test the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox.""" import asyncio +import json import os import tempfile import time @@ -13,6 +14,7 @@ from opendevin.core.config import AppConfig, SandboxConfig, load_from_env from opendevin.core.logger import opendevin_logger as logger from opendevin.events import EventStream from opendevin.events.action import ( + BrowseInteractiveAction, BrowseURLAction, CmdRunAction, FileReadAction, @@ -29,6 +31,7 @@ from opendevin.events.observation import ( ) from opendevin.runtime.client.runtime import EventStreamRuntime from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement +from opendevin.runtime.runtime import Runtime from opendevin.runtime.server.runtime import ServerRuntime from opendevin.storage import get_file_store @@ -95,7 +98,8 @@ async def _load_runtime( run_as_devin: bool = True, enable_auto_lint: bool = False, container_image: str | None = None, -): + browsergym_eval_env: str | None = None, +) -> Runtime: sid = 'test' cli_session = 'main_test' # AgentSkills need to be initialized **before** Jupyter @@ -104,7 +108,10 @@ async def _load_runtime( config = AppConfig( workspace_base=temp_dir, workspace_mount_path=temp_dir, - sandbox=SandboxConfig(use_host_network=True), + sandbox=SandboxConfig( + use_host_network=True, + browsergym_eval_env=browsergym_eval_env, + ), ) load_from_env(config, os.environ) config.run_as_devin = run_as_devin @@ -120,7 +127,9 @@ async def _load_runtime( # NOTE: we will use the default container image specified in the config.sandbox # if it is an official od_runtime image. 
cur_container_image = config.sandbox.container_image - if 'od_runtime' not in cur_container_image: + if 'od_runtime' not in cur_container_image and cur_container_image not in { + 'xingyaoww/od-eval-miniwob:v1.0' + }: # a special exception list cur_container_image = 'ubuntu:22.04' logger.warning( f'`{config.sandbox.container_image}` is not an od_runtime image. Will use `{cur_container_image}` as the container image for testing.' @@ -387,7 +396,6 @@ async def test_simple_browse(temp_dir, box_class, run_as_devin): assert isinstance(obs, BrowserOutputObservation) assert 'http://localhost:8000' in obs.url - assert obs.status_code == 200 assert not obs.error assert obs.open_pages_urls == ['http://localhost:8000/'] assert obs.active_page_index == 0 @@ -407,6 +415,53 @@ async def test_simple_browse(temp_dir, box_class, run_as_devin): await asyncio.sleep(1) +@pytest.mark.asyncio +async def test_browsergym_eval_env(temp_dir): + runtime = await _load_runtime( + temp_dir, + # only supported in event stream runtime + box_class=EventStreamRuntime, + run_as_devin=False, # need root permission to access file + container_image='xingyaoww/od-eval-miniwob:v1.0', + browsergym_eval_env='browsergym/miniwob.choose-list', + ) + from opendevin.runtime.browser.browser_env import ( + BROWSER_EVAL_GET_GOAL_ACTION, + BROWSER_EVAL_GET_REWARDS_ACTION, + ) + + # Test browse + action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = await runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + assert isinstance(obs, BrowserOutputObservation) + assert not obs.error + assert 'Select' in obs.content + assert 'from the list and click Submit' in obs.content + + # Make sure the browser can produce observations in eval mode + action = BrowseInteractiveAction(browser_actions='noop()') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = await runtime.run_action(action) + logger.info(obs, 
extra={'msg_type': 'OBSERVATION'}) + assert ( + obs.url.strip() + == 'file:///miniwob-plusplus/miniwob/html/miniwob/choose-list.html' + ) + + # Make sure the rewards are working + action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = await runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert json.loads(obs.content) == [0.0] + + await runtime.close() + await asyncio.sleep(1) + + @pytest.mark.asyncio async def test_single_multiline_command(temp_dir, box_class): runtime = await _load_runtime(temp_dir, box_class)