feat(agent): add BrowseURLAction to CodeAct (produce markdown from URL) (#5285)

This commit is contained in:
Xingyao Wang
2024-11-27 15:55:57 -06:00
committed by GitHub
parent f0ca2239f3
commit 4d3b035e00
9 changed files with 109 additions and 51 deletions

View File

@@ -19,6 +19,7 @@ from openhands.events.action import (
AgentDelegateAction,
AgentFinishAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
FileEditAction,
IPythonRunCellAction,
@@ -266,6 +267,30 @@ StrReplaceEditorTool = ChatCompletionToolParam(
),
)
_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `webpage_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).
You may use the `webpage_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
"""
WebReadTool = ChatCompletionToolParam(
type='function',
function=ChatCompletionToolParamFunctionChunk(
name='web_read',
description=_WEB_DESCRIPTION,
parameters={
'type': 'object',
'properties': {
'url': {
'type': 'string',
'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).',
}
},
'required': ['url'],
},
),
)
# from browsergym/core/action/highlevel.py
_browser_action_space = HighLevelActionSet(
subsets=['bid', 'nav'],
@@ -274,7 +299,7 @@ _browser_action_space = HighLevelActionSet(
)
_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage.
See the description of "code" parameter for more details.
@@ -484,6 +509,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
action = IPythonRunCellAction(code=code, include_extra=False)
elif tool_call.function.name == 'browser':
action = BrowseInteractiveAction(browser_actions=arguments['code'])
elif tool_call.function.name == 'web_read':
action = BrowseURLAction(url=arguments['url'])
else:
raise FunctionCallNotExistsError(
f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.'
@@ -516,6 +543,7 @@ def get_tools(
) -> list[ChatCompletionToolParam]:
tools = [CmdRunTool, FinishTool]
if codeact_enable_browsing:
tools.append(WebReadTool)
tools.append(BrowserTool)
if codeact_enable_jupyter:
tools.append(IPythonTool)