mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-01-09 14:57:59 -05:00
feat(agent, CodeAct 2.2): native CodeAct support for Browsing (#4667)
Co-authored-by: tofarr <tofarr@gmail.com>
This commit is contained in:
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
|
|||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
run_evaluation,
|
run_evaluation,
|
||||||
|
update_llm_config_for_completions_logging,
|
||||||
)
|
)
|
||||||
from openhands.controller.state.state import State
|
from openhands.controller.state.state import State
|
||||||
from openhands.core.config import (
|
from openhands.core.config import (
|
||||||
@@ -55,18 +56,14 @@ def get_config(
|
|||||||
workspace_base=None,
|
workspace_base=None,
|
||||||
workspace_mount_path=None,
|
workspace_mount_path=None,
|
||||||
)
|
)
|
||||||
if metadata.llm_config.log_completions:
|
config.set_llm_config(
|
||||||
metadata.llm_config.log_completions_folder = os.path.join(
|
update_llm_config_for_completions_logging(
|
||||||
metadata.eval_output_dir, 'llm_completions', instance_id
|
metadata.llm_config, metadata.eval_output_dir, instance_id
|
||||||
)
|
)
|
||||||
logger.info(
|
)
|
||||||
f'Logging LLM completions for instance {instance_id} to '
|
|
||||||
f'{metadata.llm_config.log_completions_folder}'
|
|
||||||
)
|
|
||||||
config.set_llm_config(metadata.llm_config)
|
|
||||||
agent_config = AgentConfig(
|
agent_config = AgentConfig(
|
||||||
codeact_enable_jupyter=True,
|
codeact_enable_jupyter=True,
|
||||||
codeact_enable_browsing_delegate=True,
|
codeact_enable_browsing=True,
|
||||||
codeact_enable_llm_editor=False,
|
codeact_enable_llm_editor=False,
|
||||||
)
|
)
|
||||||
config.set_agent_config(agent_config)
|
config.set_agent_config(agent_config)
|
||||||
|
|||||||
44
evaluation/integration_tests/tests/t06_github_pr_browsing.py
Normal file
44
evaluation/integration_tests/tests/t06_github_pr_browsing.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult
|
||||||
|
from openhands.events.action import AgentFinishAction, MessageAction
|
||||||
|
from openhands.events.event import Event
|
||||||
|
from openhands.events.observation import AgentDelegateObservation
|
||||||
|
from openhands.runtime.base import Runtime
|
||||||
|
|
||||||
|
|
||||||
|
class Test(BaseIntegrationTest):
    """Integration test that asks the agent to read and summarize a GitHub PR.

    The agent receives ``INSTRUCTION`` and the run is judged by scanning the
    text of its messages, finish action and delegate observations for a small
    set of expected keywords (see ``verify_result``).
    """

    INSTRUCTION = 'Look at https://github.com/All-Hands-AI/OpenHands/pull/8, and tell me what is happening there and what did @asadm suggest.'

    @classmethod
    def initialize_runtime(cls, runtime: Runtime) -> None:
        """Do nothing: this test performs no runtime setup."""
        pass

    @classmethod
    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
        """Check whether any agent-produced text contains an expected keyword.

        Args:
            runtime: The runtime the agent ran in (unused here).
            histories: The events recorded during the run.

        Returns:
            ``TestResult(success=True)`` as soon as one inspected event contains
            'non-commercial', 'MIT' or 'Apache 2.0'; otherwise a failing
            ``TestResult`` whose reason lists the inspected events.

        Raises:
            ValueError: If an inspected event is of an unexpected type.
        """
        # Any one of these substrings counts as a correct answer.
        # NOTE(review): presumably these are the license terms discussed in
        # the referenced PR -- confirm against the PR page.
        expected_keywords = ('non-commercial', 'MIT', 'Apache 2.0')

        # Only events that can carry the agent's answer text are inspected.
        message_actions = [
            event
            for event in histories
            if isinstance(
                event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
            )
        ]
        for event in message_actions:
            if isinstance(event, AgentDelegateObservation):
                content = event.content
            elif isinstance(event, AgentFinishAction):
                # A finish action keeps its text in the ``outputs`` mapping.
                content = event.outputs.get('content', '')
            elif isinstance(event, MessageAction):
                content = event.content
            else:
                # Defensive: the filter above should make this unreachable.
                raise ValueError(f'Unknown event type: {type(event)}')

            if any(keyword in content for keyword in expected_keywords):
                return TestResult(success=True)
        return TestResult(
            success=False,
            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
        )
|
||||||
@@ -10,10 +10,12 @@ import pandas as pd
|
|||||||
from evaluation.utils.shared import (
|
from evaluation.utils.shared import (
|
||||||
EvalMetadata,
|
EvalMetadata,
|
||||||
EvalOutput,
|
EvalOutput,
|
||||||
|
codeact_user_response,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
run_evaluation,
|
run_evaluation,
|
||||||
|
update_llm_config_for_completions_logging,
|
||||||
)
|
)
|
||||||
from openhands.controller.state.state import State
|
from openhands.controller.state.state import State
|
||||||
from openhands.core.config import (
|
from openhands.core.config import (
|
||||||
@@ -29,7 +31,10 @@ from openhands.events.action import (
|
|||||||
CmdRunAction,
|
CmdRunAction,
|
||||||
MessageAction,
|
MessageAction,
|
||||||
)
|
)
|
||||||
from openhands.events.observation import CmdOutputObservation
|
from openhands.events.observation import (
|
||||||
|
BrowserOutputObservation,
|
||||||
|
CmdOutputObservation,
|
||||||
|
)
|
||||||
from openhands.runtime.base import Runtime
|
from openhands.runtime.base import Runtime
|
||||||
from openhands.runtime.browser.browser_env import (
|
from openhands.runtime.browser.browser_env import (
|
||||||
BROWSER_EVAL_GET_GOAL_ACTION,
|
BROWSER_EVAL_GET_GOAL_ACTION,
|
||||||
@@ -37,7 +42,11 @@ from openhands.runtime.browser.browser_env import (
|
|||||||
)
|
)
|
||||||
from openhands.utils.async_utils import call_async_from_sync
|
from openhands.utils.async_utils import call_async_from_sync
|
||||||
|
|
||||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
|
||||||
|
|
||||||
|
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||||
|
'CodeActAgent': codeact_user_response,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_config(
|
def get_config(
|
||||||
@@ -47,25 +56,32 @@ def get_config(
|
|||||||
config = AppConfig(
|
config = AppConfig(
|
||||||
default_agent=metadata.agent_class,
|
default_agent=metadata.agent_class,
|
||||||
run_as_openhands=False,
|
run_as_openhands=False,
|
||||||
runtime='eventstream',
|
runtime=os.environ.get('RUNTIME', 'eventstream'),
|
||||||
max_iterations=metadata.max_iterations,
|
max_iterations=metadata.max_iterations,
|
||||||
sandbox=SandboxConfig(
|
sandbox=SandboxConfig(
|
||||||
base_container_image='xingyaoww/od-eval-miniwob:v1.0',
|
base_container_image='xingyaoww/od-eval-miniwob:v1.0',
|
||||||
enable_auto_lint=True,
|
enable_auto_lint=True,
|
||||||
use_host_network=False,
|
use_host_network=False,
|
||||||
browsergym_eval_env=env_id,
|
browsergym_eval_env=env_id,
|
||||||
|
api_key=os.environ.get('ALLHANDS_API_KEY', None),
|
||||||
|
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
|
||||||
|
keep_remote_runtime_alive=False,
|
||||||
),
|
),
|
||||||
# do not mount workspace
|
# do not mount workspace
|
||||||
workspace_base=None,
|
workspace_base=None,
|
||||||
workspace_mount_path=None,
|
workspace_mount_path=None,
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(
|
||||||
|
update_llm_config_for_completions_logging(
|
||||||
|
metadata.llm_config, metadata.eval_output_dir, env_id
|
||||||
|
)
|
||||||
|
)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
def initialize_runtime(
|
def initialize_runtime(
|
||||||
runtime: Runtime,
|
runtime: Runtime,
|
||||||
) -> str:
|
) -> tuple[str, BrowserOutputObservation]:
|
||||||
"""Initialize the runtime for the agent.
|
"""Initialize the runtime for the agent.
|
||||||
|
|
||||||
This function is called before the runtime is used to run the agent.
|
This function is called before the runtime is used to run the agent.
|
||||||
@@ -85,8 +101,14 @@ def initialize_runtime(
|
|||||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||||
goal = obs.content
|
goal = obs.content
|
||||||
|
|
||||||
|
# Run noop to get the initial browser observation (e.g., the page URL & content)
|
||||||
|
action = BrowseInteractiveAction(browser_actions='noop(1000)')
|
||||||
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||||
|
obs = runtime.run_action(action)
|
||||||
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||||
|
|
||||||
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
||||||
return goal
|
return goal, obs
|
||||||
|
|
||||||
|
|
||||||
def complete_runtime(
|
def complete_runtime(
|
||||||
@@ -117,7 +139,7 @@ def process_instance(
|
|||||||
metadata: EvalMetadata,
|
metadata: EvalMetadata,
|
||||||
reset_logger: bool = True,
|
reset_logger: bool = True,
|
||||||
) -> EvalOutput:
|
) -> EvalOutput:
|
||||||
env_id = instance.id
|
env_id = instance.instance_id
|
||||||
config = get_config(metadata, env_id)
|
config = get_config(metadata, env_id)
|
||||||
|
|
||||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||||
@@ -129,7 +151,12 @@ def process_instance(
|
|||||||
|
|
||||||
runtime = create_runtime(config)
|
runtime = create_runtime(config)
|
||||||
call_async_from_sync(runtime.connect)
|
call_async_from_sync(runtime.connect)
|
||||||
task_str = initialize_runtime(runtime)
|
task_str, obs = initialize_runtime(runtime)
|
||||||
|
|
||||||
|
task_str += (
|
||||||
|
f'\nInitial browser state (output of `noop(1000)`):\n{obs.get_agent_obs_text()}'
|
||||||
|
)
|
||||||
|
|
||||||
state: State | None = asyncio.run(
|
state: State | None = asyncio.run(
|
||||||
run_controller(
|
run_controller(
|
||||||
config=config,
|
config=config,
|
||||||
@@ -137,6 +164,9 @@ def process_instance(
|
|||||||
content=task_str
|
content=task_str
|
||||||
), # take output from initialize_runtime
|
), # take output from initialize_runtime
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
|
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||||
|
metadata.agent_class
|
||||||
|
],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -159,7 +189,7 @@ def process_instance(
|
|||||||
|
|
||||||
return_val = complete_runtime(runtime)
|
return_val = complete_runtime(runtime)
|
||||||
logger.info(f'Return value from complete_runtime: {return_val}')
|
logger.info(f'Return value from complete_runtime: {return_val}')
|
||||||
reward = max(return_val['rewards'])
|
reward = max(return_val['rewards'], default=0)
|
||||||
|
|
||||||
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
|
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
|
||||||
# for compatibility with the existing output format, we can remake the pairs here
|
# for compatibility with the existing output format, we can remake the pairs here
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
|
|||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
run_evaluation,
|
run_evaluation,
|
||||||
|
update_llm_config_for_completions_logging,
|
||||||
)
|
)
|
||||||
from openhands.controller.state.state import State
|
from openhands.controller.state.state import State
|
||||||
from openhands.core.config import (
|
from openhands.core.config import (
|
||||||
@@ -76,15 +77,13 @@ def get_config(
|
|||||||
workspace_base=None,
|
workspace_base=None,
|
||||||
workspace_mount_path=None,
|
workspace_mount_path=None,
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(
|
||||||
if metadata.llm_config.log_completions:
|
update_llm_config_for_completions_logging(
|
||||||
metadata.llm_config.log_completions_folder = os.path.join(
|
metadata.llm_config,
|
||||||
metadata.eval_output_dir, 'llm_completions', instance_id
|
metadata.eval_output_dir,
|
||||||
)
|
instance_id,
|
||||||
logger.info(
|
|
||||||
f'Logging LLM completions for instance {instance_id} to '
|
|
||||||
f'{metadata.llm_config.log_completions_folder}'
|
|
||||||
)
|
)
|
||||||
|
)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
|
|||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
run_evaluation,
|
run_evaluation,
|
||||||
|
update_llm_config_for_completions_logging,
|
||||||
)
|
)
|
||||||
from openhands.controller.state.state import State
|
from openhands.controller.state.state import State
|
||||||
from openhands.core.config import (
|
from openhands.core.config import (
|
||||||
@@ -40,6 +41,7 @@ from openhands.utils.async_utils import call_async_from_sync
|
|||||||
|
|
||||||
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
|
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
|
||||||
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
|
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
|
||||||
|
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
|
||||||
|
|
||||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||||
'CodeActAgent': codeact_user_response,
|
'CodeActAgent': codeact_user_response,
|
||||||
@@ -88,6 +90,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
|
|||||||
'5. Think about edgecases and make sure your fix handles them as well\n'
|
'5. Think about edgecases and make sure your fix handles them as well\n'
|
||||||
"Your thinking should be thorough and so it's fine if it's very long.\n"
|
"Your thinking should be thorough and so it's fine if it's very long.\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if RUN_WITH_BROWSING:
|
||||||
|
instruction += (
|
||||||
|
'<IMPORTANT!>\n'
|
||||||
|
'You SHOULD NEVER attempt to browse the web. '
|
||||||
|
'</IMPORTANT!>\n'
|
||||||
|
)
|
||||||
return instruction
|
return instruction
|
||||||
|
|
||||||
|
|
||||||
@@ -142,18 +151,14 @@ def get_config(
|
|||||||
workspace_base=None,
|
workspace_base=None,
|
||||||
workspace_mount_path=None,
|
workspace_mount_path=None,
|
||||||
)
|
)
|
||||||
if metadata.llm_config.log_completions:
|
config.set_llm_config(
|
||||||
metadata.llm_config.log_completions_folder = os.path.join(
|
update_llm_config_for_completions_logging(
|
||||||
metadata.eval_output_dir, 'llm_completions', instance['instance_id']
|
metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
|
||||||
)
|
)
|
||||||
logger.info(
|
)
|
||||||
f'Logging LLM completions for instance {instance["instance_id"]} to '
|
|
||||||
f'{metadata.llm_config.log_completions_folder}'
|
|
||||||
)
|
|
||||||
config.set_llm_config(metadata.llm_config)
|
|
||||||
agent_config = AgentConfig(
|
agent_config = AgentConfig(
|
||||||
codeact_enable_jupyter=False,
|
codeact_enable_jupyter=False,
|
||||||
codeact_enable_browsing_delegate=False,
|
codeact_enable_browsing=RUN_WITH_BROWSING,
|
||||||
codeact_enable_llm_editor=False,
|
codeact_enable_llm_editor=False,
|
||||||
)
|
)
|
||||||
config.set_agent_config(agent_config)
|
config.set_agent_config(agent_config)
|
||||||
|
|||||||
@@ -34,6 +34,11 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then
|
|||||||
USE_INSTANCE_IMAGE=true
|
USE_INSTANCE_IMAGE=true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ -z "$RUN_WITH_BROWSING" ]; then
|
||||||
|
echo "RUN_WITH_BROWSING not specified, use default false"
|
||||||
|
RUN_WITH_BROWSING=false
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
if [ -z "$DATASET" ]; then
|
if [ -z "$DATASET" ]; then
|
||||||
echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
|
echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
|
||||||
@@ -47,6 +52,8 @@ fi
|
|||||||
|
|
||||||
export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
|
export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
|
||||||
echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
|
echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
|
||||||
|
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
|
||||||
|
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
|
||||||
|
|
||||||
get_agent_version
|
get_agent_version
|
||||||
|
|
||||||
@@ -67,6 +74,10 @@ if [ "$USE_HINT_TEXT" = false ]; then
|
|||||||
EVAL_NOTE="$EVAL_NOTE-no-hint"
|
EVAL_NOTE="$EVAL_NOTE-no-hint"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ "$RUN_WITH_BROWSING" = true ]; then
|
||||||
|
EVAL_NOTE="$EVAL_NOTE-with-browsing"
|
||||||
|
fi
|
||||||
|
|
||||||
if [ -n "$EXP_NAME" ]; then
|
if [ -n "$EXP_NAME" ]; then
|
||||||
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
|
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -411,3 +411,20 @@ def reset_logger_for_multiprocessing(
|
|||||||
)
|
)
|
||||||
file_handler.setLevel(logging.INFO)
|
file_handler.setLevel(logging.INFO)
|
||||||
logger.addHandler(file_handler)
|
logger.addHandler(file_handler)
|
||||||
|
|
||||||
|
|
||||||
|
def update_llm_config_for_completions_logging(
    llm_config: LLMConfig,
    eval_output_dir: str,
    instance_id: str,
) -> LLMConfig:
    """Route LLM completion logs to a per-instance folder when logging is on.

    If ``llm_config.log_completions`` is truthy, ``log_completions_folder`` is
    set on the given config (in place) to
    ``<eval_output_dir>/llm_completions/<instance_id>`` and the destination is
    logged. Otherwise the config is left untouched.

    Args:
        llm_config: The LLM config to inspect and possibly modify.
        eval_output_dir: Root output directory of the evaluation run.
        instance_id: Identifier used as the per-instance sub-folder name.

    Returns:
        The same ``llm_config`` object that was passed in.
    """
    # Nothing to configure when completion logging is disabled.
    if not llm_config.log_completions:
        return llm_config

    completions_dir = os.path.join(eval_output_dir, 'llm_completions', instance_id)
    llm_config.log_completions_folder = completions_dir
    logger.info(
        f'Logging LLM completions for instance {instance_id} to '
        f'{llm_config.log_completions_folder}'
    )
    return llm_config
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from openhands.events.action import (
|
|||||||
Action,
|
Action,
|
||||||
AgentDelegateAction,
|
AgentDelegateAction,
|
||||||
AgentFinishAction,
|
AgentFinishAction,
|
||||||
|
BrowseInteractiveAction,
|
||||||
CmdRunAction,
|
CmdRunAction,
|
||||||
FileEditAction,
|
FileEditAction,
|
||||||
IPythonRunCellAction,
|
IPythonRunCellAction,
|
||||||
@@ -23,6 +24,7 @@ from openhands.events.action import (
|
|||||||
)
|
)
|
||||||
from openhands.events.observation import (
|
from openhands.events.observation import (
|
||||||
AgentDelegateObservation,
|
AgentDelegateObservation,
|
||||||
|
BrowserOutputObservation,
|
||||||
CmdOutputObservation,
|
CmdOutputObservation,
|
||||||
FileEditObservation,
|
FileEditObservation,
|
||||||
IPythonRunCellObservation,
|
IPythonRunCellObservation,
|
||||||
@@ -42,7 +44,7 @@ from openhands.utils.prompt import PromptManager
|
|||||||
|
|
||||||
|
|
||||||
class CodeActAgent(Agent):
|
class CodeActAgent(Agent):
|
||||||
VERSION = '2.1'
|
VERSION = '2.2'
|
||||||
"""
|
"""
|
||||||
The Code Act Agent is a minimalist agent.
|
The Code Act Agent is a minimalist agent.
|
||||||
The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
|
The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
|
||||||
@@ -105,7 +107,7 @@ class CodeActAgent(Agent):
|
|||||||
if self.function_calling_active:
|
if self.function_calling_active:
|
||||||
# Function calling mode
|
# Function calling mode
|
||||||
self.tools = codeact_function_calling.get_tools(
|
self.tools = codeact_function_calling.get_tools(
|
||||||
codeact_enable_browsing_delegate=self.config.codeact_enable_browsing_delegate,
|
codeact_enable_browsing=self.config.codeact_enable_browsing,
|
||||||
codeact_enable_jupyter=self.config.codeact_enable_jupyter,
|
codeact_enable_jupyter=self.config.codeact_enable_jupyter,
|
||||||
codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
|
codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
|
||||||
)
|
)
|
||||||
@@ -142,10 +144,10 @@ class CodeActAgent(Agent):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
action (Action): The action to convert. Can be one of:
|
action (Action): The action to convert. Can be one of:
|
||||||
- AgentDelegateAction: For delegating tasks to other agents
|
|
||||||
- CmdRunAction: For executing bash commands
|
- CmdRunAction: For executing bash commands
|
||||||
- IPythonRunCellAction: For running IPython code
|
- IPythonRunCellAction: For running IPython code
|
||||||
- FileEditAction: For editing files
|
- FileEditAction: For editing files
|
||||||
|
- BrowseInteractiveAction: For browsing the web
|
||||||
- AgentFinishAction: For ending the interaction
|
- AgentFinishAction: For ending the interaction
|
||||||
- MessageAction: For sending messages
|
- MessageAction: For sending messages
|
||||||
pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
|
pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
|
||||||
@@ -169,6 +171,7 @@ class CodeActAgent(Agent):
|
|||||||
CmdRunAction,
|
CmdRunAction,
|
||||||
IPythonRunCellAction,
|
IPythonRunCellAction,
|
||||||
FileEditAction,
|
FileEditAction,
|
||||||
|
BrowseInteractiveAction,
|
||||||
),
|
),
|
||||||
) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
|
) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
|
||||||
if self.function_calling_active:
|
if self.function_calling_active:
|
||||||
@@ -192,6 +195,10 @@ class CodeActAgent(Agent):
|
|||||||
)
|
)
|
||||||
return []
|
return []
|
||||||
else:
|
else:
|
||||||
|
assert not isinstance(action, BrowseInteractiveAction), (
|
||||||
|
'BrowseInteractiveAction is not supported in non-function calling mode. Action: '
|
||||||
|
+ str(action)
|
||||||
|
)
|
||||||
content = [TextContent(text=self.action_parser.action_to_str(action))]
|
content = [TextContent(text=self.action_parser.action_to_str(action))]
|
||||||
return [
|
return [
|
||||||
Message(
|
Message(
|
||||||
@@ -266,6 +273,12 @@ class CodeActAgent(Agent):
|
|||||||
elif isinstance(obs, FileEditObservation):
|
elif isinstance(obs, FileEditObservation):
|
||||||
text = obs_prefix + truncate_content(str(obs), max_message_chars)
|
text = obs_prefix + truncate_content(str(obs), max_message_chars)
|
||||||
message = Message(role='user', content=[TextContent(text=text)])
|
message = Message(role='user', content=[TextContent(text=text)])
|
||||||
|
elif isinstance(obs, BrowserOutputObservation):
|
||||||
|
text = obs.get_agent_obs_text()
|
||||||
|
message = Message(
|
||||||
|
role='user',
|
||||||
|
content=[TextContent(text=obs_prefix + text)],
|
||||||
|
)
|
||||||
elif isinstance(obs, AgentDelegateObservation):
|
elif isinstance(obs, AgentDelegateObservation):
|
||||||
text = obs_prefix + truncate_content(
|
text = obs_prefix + truncate_content(
|
||||||
obs.outputs['content'] if 'content' in obs.outputs else '',
|
obs.outputs['content'] if 'content' in obs.outputs else '',
|
||||||
@@ -335,6 +348,7 @@ class CodeActAgent(Agent):
|
|||||||
}
|
}
|
||||||
if self.function_calling_active:
|
if self.function_calling_active:
|
||||||
params['tools'] = self.tools
|
params['tools'] = self.tools
|
||||||
|
params['parallel_tool_calls'] = False
|
||||||
else:
|
else:
|
||||||
params['stop'] = [
|
params['stop'] = [
|
||||||
'</execute_ipython>',
|
'</execute_ipython>',
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ This is similar to the functionality of `CodeActResponseParser`.
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
from browsergym.core.action.highlevel import HighLevelActionSet
|
||||||
from litellm import (
|
from litellm import (
|
||||||
ChatCompletionToolParam,
|
ChatCompletionToolParam,
|
||||||
ChatCompletionToolParamFunctionChunk,
|
ChatCompletionToolParamFunctionChunk,
|
||||||
@@ -16,6 +17,7 @@ from openhands.events.action import (
|
|||||||
Action,
|
Action,
|
||||||
AgentDelegateAction,
|
AgentDelegateAction,
|
||||||
AgentFinishAction,
|
AgentFinishAction,
|
||||||
|
BrowseInteractiveAction,
|
||||||
CmdRunAction,
|
CmdRunAction,
|
||||||
FileEditAction,
|
FileEditAction,
|
||||||
IPythonRunCellAction,
|
IPythonRunCellAction,
|
||||||
@@ -272,24 +274,146 @@ StrReplaceEditorTool = ChatCompletionToolParam(
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
_BROWSER_DELEGATION = """Delegate the task to another browsing agent.
|
# from browsergym/core/action/highlevel.py
|
||||||
The assistant should delegate the task if it needs to browse the Internet.
|
# BrowserGym action set restricted to the 'bid' and 'nav' subsets; it is used
# further down to check that _BROWSER_DESCRIPTION mentions every action.
_browser_action_space = HighLevelActionSet(
    subsets=['bid', 'nav'],
    strict=False,  # less strict on the parsing of the actions
    multiaction=True,  # enable the agent to take multiple actions at once
)
|
||||||
|
|
||||||
|
|
||||||
|
_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
|
||||||
|
The following 15 functions are available. Nothing else is supported.
|
||||||
|
|
||||||
|
goto(url: str)
|
||||||
|
Description: Navigate to a url.
|
||||||
|
Examples:
|
||||||
|
goto('http://www.example.com')
|
||||||
|
|
||||||
|
go_back()
|
||||||
|
Description: Navigate to the previous page in history.
|
||||||
|
Examples:
|
||||||
|
go_back()
|
||||||
|
|
||||||
|
go_forward()
|
||||||
|
Description: Navigate to the next page in history.
|
||||||
|
Examples:
|
||||||
|
go_forward()
|
||||||
|
|
||||||
|
noop(wait_ms: float = 1000)
|
||||||
|
Description: Do nothing, and optionally wait for the given time (in milliseconds).
|
||||||
|
You can use this to get the current page content and/or wait for the page to load.
|
||||||
|
Examples:
|
||||||
|
noop()
|
||||||
|
|
||||||
|
noop(500)
|
||||||
|
|
||||||
|
scroll(delta_x: float, delta_y: float)
|
||||||
|
Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
|
||||||
|
Examples:
|
||||||
|
scroll(0, 200)
|
||||||
|
|
||||||
|
scroll(-50.2, -100.5)
|
||||||
|
|
||||||
|
fill(bid: str, value: str)
|
||||||
|
Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
|
||||||
|
Examples:
|
||||||
|
fill('237', 'example value')
|
||||||
|
|
||||||
|
fill('45', 'multi-line\nexample')
|
||||||
|
|
||||||
|
fill('a12', 'example with "quotes"')
|
||||||
|
|
||||||
|
select_option(bid: str, options: str | list[str])
|
||||||
|
Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
|
||||||
|
Examples:
|
||||||
|
select_option('a48', 'blue')
|
||||||
|
|
||||||
|
select_option('c48', ['red', 'green', 'blue'])
|
||||||
|
|
||||||
|
click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
|
||||||
|
Description: Click an element.
|
||||||
|
Examples:
|
||||||
|
click('a51')
|
||||||
|
|
||||||
|
click('b22', button='right')
|
||||||
|
|
||||||
|
click('48', button='middle', modifiers=['Shift'])
|
||||||
|
|
||||||
|
dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
|
||||||
|
Description: Double click an element.
|
||||||
|
Examples:
|
||||||
|
dblclick('12')
|
||||||
|
|
||||||
|
dblclick('ca42', button='right')
|
||||||
|
|
||||||
|
dblclick('178', button='middle', modifiers=['Shift'])
|
||||||
|
|
||||||
|
hover(bid: str)
|
||||||
|
Description: Hover over an element.
|
||||||
|
Examples:
|
||||||
|
hover('b8')
|
||||||
|
|
||||||
|
press(bid: str, key_comb: str)
|
||||||
|
Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
|
||||||
|
Examples:
|
||||||
|
press('88', 'Backspace')
|
||||||
|
|
||||||
|
press('a26', 'ControlOrMeta+a')
|
||||||
|
|
||||||
|
press('a61', 'Meta+Shift+t')
|
||||||
|
|
||||||
|
focus(bid: str)
|
||||||
|
Description: Focus the matching element.
|
||||||
|
Examples:
|
||||||
|
focus('b455')
|
||||||
|
|
||||||
|
clear(bid: str)
|
||||||
|
Description: Clear the input field.
|
||||||
|
Examples:
|
||||||
|
clear('996')
|
||||||
|
|
||||||
|
drag_and_drop(from_bid: str, to_bid: str)
|
||||||
|
Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
|
||||||
|
Examples:
|
||||||
|
drag_and_drop('56', '498')
|
||||||
|
|
||||||
|
upload_file(bid: str, file: str | list[str])
|
||||||
|
Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
|
||||||
|
Examples:
|
||||||
|
upload_file('572', '/home/user/my_receipt.pdf')
|
||||||
|
|
||||||
|
upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
|
||||||
|
|
||||||
|
Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
|
||||||
|
More than 2-3 actions usually leads to failure or unexpected behavior. Example:
|
||||||
|
fill('a12', 'example with "quotes"')
|
||||||
|
click('a51')
|
||||||
|
click('48', button='middle', modifiers=['Shift'])
|
||||||
"""
|
"""
|
||||||
|
|
||||||
BrowserDelegationTool = ChatCompletionToolParam(
|
# Module-level consistency check: _BROWSER_DESCRIPTION is a hand-written copy
# of the action documentation, so fail loudly if any action's signature or
# description from the BrowserGym action set is not found verbatim in it.
for _, action in _browser_action_space.action_set.items():
    assert (
        action.signature in _BROWSER_DESCRIPTION
    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
    assert (
        action.description in _BROWSER_DESCRIPTION
    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
|
||||||
|
|
||||||
|
BrowserTool = ChatCompletionToolParam(
|
||||||
type='function',
|
type='function',
|
||||||
function=ChatCompletionToolParamFunctionChunk(
|
function=ChatCompletionToolParamFunctionChunk(
|
||||||
name='delegate_to_browsing_agent',
|
name='browser',
|
||||||
description=_BROWSER_DELEGATION,
|
description=_BROWSER_DESCRIPTION,
|
||||||
parameters={
|
parameters={
|
||||||
'type': 'object',
|
'type': 'object',
|
||||||
'properties': {
|
'properties': {
|
||||||
'task': {
|
'code': {
|
||||||
'type': 'string',
|
'type': 'string',
|
||||||
'description': 'The task for the browsing agent to execute. It should include all the necessary context and specify what information the browsing agent should return.',
|
'description': 'The Python code that interacts with the browser.',
|
||||||
},
|
}
|
||||||
},
|
},
|
||||||
'required': ['task'],
|
'required': ['code'],
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
@@ -357,6 +481,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
|
|||||||
f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
|
f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
|
||||||
)
|
)
|
||||||
action = IPythonRunCellAction(code=code, include_extra=False)
|
action = IPythonRunCellAction(code=code, include_extra=False)
|
||||||
|
elif tool_call.function.name == 'browser':
|
||||||
|
action = BrowseInteractiveAction(browser_actions=arguments['code'])
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(f'Unknown tool call: {tool_call.function.name}')
|
raise RuntimeError(f'Unknown tool call: {tool_call.function.name}')
|
||||||
|
|
||||||
@@ -381,13 +507,13 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
|
|||||||
|
|
||||||
|
|
||||||
def get_tools(
|
def get_tools(
|
||||||
codeact_enable_browsing_delegate: bool = False,
|
codeact_enable_browsing: bool = False,
|
||||||
codeact_enable_llm_editor: bool = False,
|
codeact_enable_llm_editor: bool = False,
|
||||||
codeact_enable_jupyter: bool = False,
|
codeact_enable_jupyter: bool = False,
|
||||||
) -> list[ChatCompletionToolParam]:
|
) -> list[ChatCompletionToolParam]:
|
||||||
tools = [CmdRunTool, FinishTool]
|
tools = [CmdRunTool, FinishTool]
|
||||||
if codeact_enable_browsing_delegate:
|
if codeact_enable_browsing:
|
||||||
tools.append(BrowserDelegationTool)
|
tools.append(BrowserTool)
|
||||||
if codeact_enable_jupyter:
|
if codeact_enable_jupyter:
|
||||||
tools.append(IPythonTool)
|
tools.append(IPythonTool)
|
||||||
if codeact_enable_llm_editor:
|
if codeact_enable_llm_editor:
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ class AgentConfig:
|
|||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
function_calling: Whether function calling is enabled. Default is True.
|
function_calling: Whether function calling is enabled. Default is True.
|
||||||
codeact_enable_browsing_delegate: Whether browsing delegate is enabled in the action space. Default is False. Only works with function calling.
|
codeact_enable_browsing: Whether browsing is enabled in the action space. Default is True. Only works with function calling.
|
||||||
codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling.
|
codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling.
|
||||||
codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is False.
|
codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is False.
|
||||||
micro_agent_name: The name of the micro agent to use for this agent.
|
micro_agent_name: The name of the micro agent to use for this agent.
|
||||||
@@ -19,7 +19,7 @@ class AgentConfig:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
function_calling: bool = True
|
function_calling: bool = True
|
||||||
codeact_enable_browsing_delegate: bool = True
|
codeact_enable_browsing: bool = True
|
||||||
codeact_enable_llm_editor: bool = False
|
codeact_enable_llm_editor: bool = False
|
||||||
codeact_enable_jupyter: bool = True
|
codeact_enable_jupyter: bool = True
|
||||||
micro_agent_name: str | None = None
|
micro_agent_name: str | None = None
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from browsergym.utils.obs import flatten_axtree_to_str
|
||||||
|
|
||||||
from openhands.core.schema import ObservationType
|
from openhands.core.schema import ObservationType
|
||||||
from openhands.events.observation.observation import Observation
|
from openhands.events.observation.observation import Observation
|
||||||
|
|
||||||
@@ -29,7 +31,7 @@ class BrowserOutputObservation(Observation):
|
|||||||
return 'Visited ' + self.url
|
return 'Visited ' + self.url
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
return (
|
ret = (
|
||||||
'**BrowserOutputObservation**\n'
|
'**BrowserOutputObservation**\n'
|
||||||
f'URL: {self.url}\n'
|
f'URL: {self.url}\n'
|
||||||
f'Error: {self.error}\n'
|
f'Error: {self.error}\n'
|
||||||
@@ -38,5 +40,47 @@ class BrowserOutputObservation(Observation):
|
|||||||
f'Last browser action: {self.last_browser_action}\n'
|
f'Last browser action: {self.last_browser_action}\n'
|
||||||
f'Last browser action error: {self.last_browser_action_error}\n'
|
f'Last browser action error: {self.last_browser_action_error}\n'
|
||||||
f'Focused element bid: {self.focused_element_bid}\n'
|
f'Focused element bid: {self.focused_element_bid}\n'
|
||||||
f'CONTENT: {self.content}\n'
|
f'Content: {self.content}\n'
|
||||||
)
|
)
|
||||||
|
ret += '--- Agent Observation ---\n'
|
||||||
|
ret += self.get_agent_obs_text()
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def get_agent_obs_text(self) -> str:
    """Get a concise text that will be shown to the agent.

    The text is assembled in this order:

    1. The current URL and the focused element bid.
    2. Either an error section containing ``self.last_browser_action_error``
       (when ``self.error`` is truthy) or a one-line success marker.
    3. The accessibility tree returned by ``self.get_axtree_str`` or, if
       building it raises, a bracketed note carrying the exception message.

    Returns:
        The assembled observation text.
    """
    text = f'[Current URL: {self.url}]\n'
    text += f'[Focused element bid: {self.focused_element_bid}]\n\n'

    if self.error:
        text += (
            '================ BEGIN error message ===============\n'
            'The following error occurred when executing the last action:\n'
            f'{self.last_browser_action_error}\n'
            '================ END error message ===============\n'
        )
    else:
        text += '[Action executed successfully.]\n'

    try:
        # We do not filter visible only here because we want to show the full content
        # of the web page to the agent for simplicity.
        # FIXME: handle the case when the web page is too large
        cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
        text += (
            '============== BEGIN accessibility tree ==============\n'
            f'{cur_axtree_txt}\n'
            '============== END accessibility tree ==============\n'
        )
    except Exception as e:
        # Best effort: a failure while rendering the tree must not hide the
        # URL / error information gathered above, so it is reported inline.
        text += f'\n[Error encountered when processing the accessibility tree: {e}]'
    return text
|
||||||
|
|
||||||
|
def get_axtree_str(self, filter_visible_only: bool = False) -> str:
    """Flatten this observation's accessibility tree into a string.

    Calls ``flatten_axtree_to_str`` on ``self.axtree_object`` with
    ``self.extra_element_properties`` as ``extra_properties``,
    ``with_clickable=True`` and ``skip_generic=False``.

    Args:
        filter_visible_only: Forwarded unchanged to ``flatten_axtree_to_str``.

    Returns:
        The flattened tree text. The same value is also stored on
        ``self._axtree_str`` as a side effect.
    """
    cur_axtree_txt = flatten_axtree_to_str(
        self.axtree_object,
        extra_properties=self.extra_element_properties,
        with_clickable=True,
        skip_generic=False,
        filter_visible_only=filter_visible_only,
    )
    # Keep the most recently rendered tree on the instance.
    self._axtree_str = cur_axtree_txt
    return cur_axtree_txt
|
||||||
|
|||||||
@@ -81,7 +81,10 @@ class BrowserEnv:
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
|
f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
|
||||||
)
|
)
|
||||||
env = gym.make(self.browsergym_eval_env)
|
env = gym.make(
|
||||||
|
self.browsergym_eval_env,
|
||||||
|
tags_to_mark='all',
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
env = gym.make(
|
env = gym.make(
|
||||||
'browsergym/openended',
|
'browsergym/openended',
|
||||||
@@ -89,6 +92,7 @@ class BrowserEnv:
|
|||||||
wait_for_user_message=False,
|
wait_for_user_message=False,
|
||||||
headless=True,
|
headless=True,
|
||||||
disable_env_checker=True,
|
disable_env_checker=True,
|
||||||
|
tags_to_mark='all',
|
||||||
)
|
)
|
||||||
|
|
||||||
obs, info = env.reset()
|
obs, info = env.reset()
|
||||||
|
|||||||
Reference in New Issue
Block a user