From 53f64ffa0661ff0de1f944042b11da5ecbcd8307 Mon Sep 17 00:00:00 2001 From: Frank Xu Date: Fri, 24 May 2024 03:02:19 -0400 Subject: [PATCH] Improve browsing agent prompts, allowing agent to properly finish when done (#1993) * improve browsing agent, allowing it to properly finish. * handle parsing error, show user what the agent's browsing thoughts in the front end --------- Co-authored-by: Boxuan Li --- agenthub/browsing_agent/README.md | 2 +- agenthub/browsing_agent/browsing_agent.py | 50 ++++++++++++++--------- frontend/src/services/actions.ts | 4 ++ 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/agenthub/browsing_agent/README.md b/agenthub/browsing_agent/README.md index 7391e7fd6d..4b7c05b68a 100644 --- a/agenthub/browsing_agent/README.md +++ b/agenthub/browsing_agent/README.md @@ -9,7 +9,7 @@ Note that for browsing tasks, GPT-4 is usually a requirement to get reasonable r ``` poetry run python ./opendevin/core/main.py \ - -i 5 \ + -i 10 \ -t "tell me the usa's president using google search" \ -c BrowsingAgent \ -m gpt-4o-2024-05-13 diff --git a/agenthub/browsing_agent/browsing_agent.py b/agenthub/browsing_agent/browsing_agent.py index 3f455300a2..2bce4d804f 100644 --- a/agenthub/browsing_agent/browsing_agent.py +++ b/agenthub/browsing_agent/browsing_agent.py @@ -8,6 +8,7 @@ from opendevin.controller.state.state import State from opendevin.core.logger import opendevin_logger as logger from opendevin.events.action import ( Action, + AgentFinishAction, BrowseInteractiveAction, MessageAction, ) @@ -19,12 +20,17 @@ from opendevin.runtime.plugins import ( def parse_response(response: str) -> Action: + if '```' not in response: + # unexpected response format, message back to user + return MessageAction(response) thought = response.split('```')[0].strip() action_str = response.split('```')[1].strip() - if 'send_msg_to_user(' in action_str: - tree = ast.parse(action_str) - args = tree.body[0].value.args # type: ignore - return MessageAction(args[0].value) + # handle send message to user function call in BrowserGym + for sub_action in action_str.split('\n'): + if 'send_msg_to_user(' in sub_action: + tree = ast.parse(sub_action) + args = tree.body[0].value.args # type: ignore + return MessageAction(args[0].value) return BrowseInteractiveAction(browser_actions=action_str, thought=thought) @@ -83,6 +89,25 @@ class BrowsingAgent(Agent): """ goal = state.get_current_user_intent() messages = [] + prev_actions = '' + cur_axtree_txt = '' + error_prefix = '' + last_obs = None + for prev_action, obs in state.history: + if isinstance(prev_action, BrowseInteractiveAction): + prev_actions += f'{prev_action.browser_actions}\n' + last_obs = obs + elif ( + isinstance(prev_action, MessageAction) and prev_action.source != 'user' + ): + # agent has responded, task finish. + return AgentFinishAction() + + if isinstance(last_obs, BrowserOutputObservation): + if last_obs.error: + # add error recovery prompt prefix + error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n' + cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object) system_msg = f"""\ # Instructions @@ -96,21 +121,8 @@ and executed by a program, make sure to follow the formatting instructions. # Action Space {self.action_space.describe(with_long_description=False, with_examples=True)} """ - messages.append({'role': 'system', 'content': system_msg}) - prev_actions = '' - cur_axtree_txt = '' - error_prefix = '' - last_obs = None - for prev_action, obs in state.history: - if isinstance(prev_action, BrowseInteractiveAction): - prev_actions += f'{prev_action.browser_actions}\n' - last_obs = obs - if isinstance(last_obs, BrowserOutputObservation): - if last_obs.error: - # add error recovery prompt prefix - error_prefix = f'Last action failed:\n{last_obs.last_browser_action}\nTry again with the current state of the page.\n' - cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object) + messages.append({'role': 'system', 'content': system_msg}) prompt = f"""\ {error_prefix} @@ -126,7 +138,7 @@ Here is an example with chain of thought of a valid action when clicking on a bu In order to accomplish my goal I need to click on the button with bid 12 ```click("12")``` " -""" +""".strip() messages.append({'role': 'user', 'content': prompt}) response = self.llm.completion( messages=messages, diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts index 83c4fd78eb..a67e34605e 100644 --- a/frontend/src/services/actions.ts +++ b/frontend/src/services/actions.ts @@ -16,8 +16,12 @@ const messageActions = { const { url, screenshotSrc } = message.args; store.dispatch(setUrl(url)); store.dispatch(setScreenshotSrc(screenshotSrc)); + store.dispatch(addAssistantMessage(message.message)); }, [ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => { + if (message.args.thought) { + store.dispatch(addAssistantMessage(message.args.thought)); + } const { url, screenshotSrc } = message.args; store.dispatch(setUrl(url)); store.dispatch(setScreenshotSrc(screenshotSrc));