mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-10 07:18:10 -05:00
Improve browsing agent prompts, allowing agent to properly finish when done (#1993)
* improve browsing agent, allowing it to properly finish. * handle parsing error, show user what the agent's browsing thoughts in the front end --------- Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
This commit is contained in:
@@ -9,7 +9,7 @@ Note that for browsing tasks, GPT-4 is usually a requirement to get reasonable r
|
||||
|
||||
```
|
||||
poetry run python ./opendevin/core/main.py \
|
||||
-i 5 \
|
||||
-i 10 \
|
||||
-t "tell me the usa's president using google search" \
|
||||
-c BrowsingAgent \
|
||||
-m gpt-4o-2024-05-13
|
||||
|
||||
@@ -8,6 +8,7 @@ from opendevin.controller.state.state import State
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.events.action import (
|
||||
Action,
|
||||
AgentFinishAction,
|
||||
BrowseInteractiveAction,
|
||||
MessageAction,
|
||||
)
|
||||
@@ -19,12 +20,17 @@ from opendevin.runtime.plugins import (
|
||||
|
||||
|
||||
def parse_response(response: str) -> Action:
|
||||
if '```' not in response:
|
||||
# unexpected response format, message back to user
|
||||
return MessageAction(response)
|
||||
thought = response.split('```')[0].strip()
|
||||
action_str = response.split('```')[1].strip()
|
||||
if 'send_msg_to_user(' in action_str:
|
||||
tree = ast.parse(action_str)
|
||||
args = tree.body[0].value.args # type: ignore
|
||||
return MessageAction(args[0].value)
|
||||
# handle send message to user function call in BrowserGym
|
||||
for sub_action in action_str.split('\n'):
|
||||
if 'send_msg_to_user(' in sub_action:
|
||||
tree = ast.parse(sub_action)
|
||||
args = tree.body[0].value.args # type: ignore
|
||||
return MessageAction(args[0].value)
|
||||
|
||||
return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
|
||||
|
||||
@@ -83,6 +89,25 @@ class BrowsingAgent(Agent):
|
||||
"""
|
||||
goal = state.get_current_user_intent()
|
||||
messages = []
|
||||
prev_actions = ''
|
||||
cur_axtree_txt = ''
|
||||
error_prefix = ''
|
||||
last_obs = None
|
||||
for prev_action, obs in state.history:
|
||||
if isinstance(prev_action, BrowseInteractiveAction):
|
||||
prev_actions += f'{prev_action.browser_actions}\n'
|
||||
last_obs = obs
|
||||
elif (
|
||||
isinstance(prev_action, MessageAction) and prev_action.source != 'user'
|
||||
):
|
||||
# agent has responded, task finish.
|
||||
return AgentFinishAction()
|
||||
|
||||
if isinstance(last_obs, BrowserOutputObservation):
|
||||
if last_obs.error:
|
||||
# add error recovery prompt prefix
|
||||
error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n'
|
||||
cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
|
||||
|
||||
system_msg = f"""\
|
||||
# Instructions
|
||||
@@ -96,21 +121,8 @@ and executed by a program, make sure to follow the formatting instructions.
|
||||
# Action Space
|
||||
{self.action_space.describe(with_long_description=False, with_examples=True)}
|
||||
"""
|
||||
messages.append({'role': 'system', 'content': system_msg})
|
||||
prev_actions = ''
|
||||
cur_axtree_txt = ''
|
||||
error_prefix = ''
|
||||
last_obs = None
|
||||
for prev_action, obs in state.history:
|
||||
if isinstance(prev_action, BrowseInteractiveAction):
|
||||
prev_actions += f'{prev_action.browser_actions}\n'
|
||||
last_obs = obs
|
||||
|
||||
if isinstance(last_obs, BrowserOutputObservation):
|
||||
if last_obs.error:
|
||||
# add error recovery prompt prefix
|
||||
error_prefix = f'Last action failed:\n{last_obs.last_browser_action}\nTry again with the current state of the page.\n'
|
||||
cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
|
||||
messages.append({'role': 'system', 'content': system_msg})
|
||||
|
||||
prompt = f"""\
|
||||
{error_prefix}
|
||||
@@ -126,7 +138,7 @@ Here is an example with chain of thought of a valid action when clicking on a bu
|
||||
In order to accomplish my goal I need to click on the button with bid 12
|
||||
```click("12")```
|
||||
"
|
||||
"""
|
||||
""".strip()
|
||||
messages.append({'role': 'user', 'content': prompt})
|
||||
response = self.llm.completion(
|
||||
messages=messages,
|
||||
|
||||
@@ -16,8 +16,12 @@ const messageActions = {
|
||||
const { url, screenshotSrc } = message.args;
|
||||
store.dispatch(setUrl(url));
|
||||
store.dispatch(setScreenshotSrc(screenshotSrc));
|
||||
store.dispatch(addAssistantMessage(message.message));
|
||||
},
|
||||
[ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => {
|
||||
if (message.args.thought) {
|
||||
store.dispatch(addAssistantMessage(message.args.thought));
|
||||
}
|
||||
const { url, screenshotSrc } = message.args;
|
||||
store.dispatch(setUrl(url));
|
||||
store.dispatch(setScreenshotSrc(screenshotSrc));
|
||||
|
||||
Reference in New Issue
Block a user