diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 00d7c45957..1df5bcb390 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -40,6 +40,11 @@ jobs: python-version: ${{ matrix.python-version }} cache: "poetry" + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '22.x' + - name: Comment on PR if 'integration-test' label is present if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test' uses: KeisukeYamashita/create-comment@v1 diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py index f216a86ff8..636a52e2bd 100644 --- a/evaluation/benchmarks/EDA/run_infer.py +++ b/evaluation/benchmarks/EDA/run_infer.py @@ -24,7 +24,6 @@ from openhands.core.config import ( from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction -from openhands.utils.async_utils import call_async_from_sync game = None @@ -122,7 +121,6 @@ def process_instance( # Here's how you can run the agent (similar to the `main` function) and get the final task state runtime = create_runtime(config) - call_async_from_sync(runtime.connect) state: State | None = asyncio.run( run_controller( diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py index a78e402395..68cf2ff793 100644 --- a/evaluation/benchmarks/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -34,7 +34,6 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime -from openhands.utils.async_utils import call_async_from_sync def get_config( @@ -211,7 +210,6 @@ def process_instance( # ============================================= runtime: Runtime = create_runtime(config) - call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance=instance) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index ae5faadc09..9c848f67b1 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -34,7 +34,6 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime -from openhands.utils.async_utils import call_async_from_sync # Configure visibility of unit tests to the Agent. USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true' @@ -204,7 +203,6 @@ def process_instance( # ============================================= runtime: Runtime = create_runtime(config) - call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance=instance) diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py index f1c98ed066..b0a06a6ece 100644 --- a/evaluation/benchmarks/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -31,7 +31,6 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime -from openhands.utils.async_utils import call_async_from_sync AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': functools.partial( @@ -275,7 +274,6 @@ def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] runtime = create_runtime(config) - call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py index 1c56deb967..4cdd29862f 100644 --- a/evaluation/benchmarks/bird/run_infer.py +++ b/evaluation/benchmarks/bird/run_infer.py @@ -34,7 +34,6 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime -from openhands.utils.async_utils import call_async_from_sync def codeact_user_response(state: State) -> str: @@ -400,7 +399,6 @@ def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] runtime = create_runtime(config) - call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py index 63d394a029..cf6148975b 100644 --- a/evaluation/benchmarks/commit0_bench/run_infer.py +++ b/evaluation/benchmarks/commit0_bench/run_infer.py @@ -35,7 +35,6 @@ from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation, ErrorObservation from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime -from openhands.utils.async_utils import call_async_from_sync from openhands.utils.shutdown_listener import sleep_if_should_continue USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true' @@ -395,7 +394,6 @@ def process_instance( logger.info(f'Starting evaluation for instance {instance.instance_id}.') runtime = create_runtime(config) - call_async_from_sync(runtime.connect) try: initialize_runtime(runtime, instance) diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py index d91d01194d..f3fdadab8e 100644 --- a/evaluation/benchmarks/discoverybench/run_infer.py +++ b/evaluation/benchmarks/discoverybench/run_infer.py @@ -34,7 +34,6 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime -from openhands.utils.async_utils import call_async_from_sync EVALUATION_LLM = 'gpt-4-1106-preview' @@ -282,7 +281,6 @@ def process_instance( # Here's how you can run the agent (similar to the `main` function) and get the final task state runtime = create_runtime(config) - call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance.data_files) state: State | None = asyncio.run( diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py index e63026e813..e0e5ed0363 100644 --- a/evaluation/benchmarks/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -31,7 +31,6 @@ from openhands.core.main import create_runtime, run_controller from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime -from openhands.utils.async_utils import call_async_from_sync DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -149,7 +148,6 @@ def process_instance( logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'}) runtime = create_runtime(config) - call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py index e856fa267c..22b42a8545 100644 --- a/evaluation/benchmarks/gorilla/run_infer.py +++ b/evaluation/benchmarks/gorilla/run_infer.py @@ -26,7 +26,6 @@ from openhands.core.config import ( from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction -from openhands.utils.async_utils import call_async_from_sync AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, @@ -83,7 +82,6 @@ def process_instance( # Here's how you can run the agent (similar to the `main` function) and get the final task state runtime = create_runtime(config) - call_async_from_sync(runtime.connect) state: State | None = asyncio.run( run_controller( config=config, diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py index e297e3fb9e..0f19755c34 100644 --- a/evaluation/benchmarks/gpqa/run_infer.py +++ b/evaluation/benchmarks/gpqa/run_infer.py @@ -49,7 +49,6 @@ from openhands.events.action import ( MessageAction, ) from openhands.events.observation import Observation -from openhands.utils.async_utils import call_async_from_sync ACTION_FORMAT = """ < argparse.ArgumentParser: parser.add_argument( '-n', '--name', - default='', + help='Session name', type=str, - help='Name for the session', + default='', ) parser.add_argument( '--eval-ids', @@ -487,8 +487,15 @@ def get_parser() -> argparse.ArgumentParser: ) parser.add_argument( '--no-auto-continue', + help='Disable auto-continue responses in headless mode (i.e. headless will read from stdin instead of auto-continuing)', action='store_true', - help='Disable automatic "continue" responses in headless mode. Will read from stdin instead.', + default=False, + ) + parser.add_argument( + '--selected-repo', + help='GitHub repository to clone (format: owner/repo)', + type=str, + default=None, ) return parser @@ -555,4 +562,8 @@ def setup_config_from_args(args: argparse.Namespace) -> AppConfig: if args.max_budget_per_task is not None: config.max_budget_per_task = args.max_budget_per_task + # Read selected repository in config for use by CLI and main.py + if args.selected_repo is not None: + config.sandbox.selected_repo = args.selected_repo + return config diff --git a/openhands/core/main.py b/openhands/core/main.py index 3c8efe38ab..ab642553b3 100644 --- a/openhands/core/main.py +++ b/openhands/core/main.py @@ -88,15 +88,20 @@ async def run_controller( """ sid = sid or generate_sid(config) + if agent is None: + agent = create_agent(config) + if runtime is None: - runtime = create_runtime(config, sid=sid, headless_mode=headless_mode) - await runtime.connect() + runtime = create_runtime( + config, + sid=sid, + headless_mode=headless_mode, + agent=agent, + selected_repository=config.sandbox.selected_repo, + ) event_stream = runtime.event_stream - if agent is None: - agent = create_agent(runtime, config) - replay_events: list[Event] | None = None if config.replay_trajectory_path: logger.info('Trajectory replay is enabled') diff --git a/openhands/core/setup.py b/openhands/core/setup.py index 82bdaf0c20..9142be6713 100644 --- a/openhands/core/setup.py +++ b/openhands/core/setup.py @@ -1,7 +1,10 @@ import hashlib +import os import uuid from typing import Tuple, Type +from pydantic import SecretStr + import openhands.agenthub # noqa F401 (we import this to get the agents registered) from openhands.controller import AgentController from openhands.controller.agent import Agent @@ -13,16 +16,21 @@ from openhands.core.logger import openhands_logger as logger from openhands.events import EventStream from openhands.events.event import Event from openhands.llm.llm import LLM +from openhands.microagent.microagent import BaseMicroAgent from openhands.runtime import get_runtime_cls from openhands.runtime.base import Runtime from openhands.security import SecurityAnalyzer, options from openhands.storage import get_file_store +from openhands.utils.async_utils import call_async_from_sync def create_runtime( config: AppConfig, sid: str | None = None, headless_mode: bool = True, + agent: Agent | None = None, + selected_repository: str | None = None, + github_token: SecretStr | None = None, ) -> Runtime: """Create a runtime for the agent to run on. @@ -31,6 +39,8 @@ def create_runtime( Set it to incompatible value will cause unexpected behavior on RemoteRuntime. headless_mode: Whether the agent is run in headless mode. `create_runtime` is typically called within evaluation scripts, where we don't want to have the VSCode UI open, so it defaults to True. + selected_repository: (optional) The GitHub repository to use. + github_token: (optional) The GitHub token to use. """ # if sid is provided on the command line, use it as the name of the event stream # otherwise generate it on the basis of the configured jwt_secret @@ -41,8 +51,17 @@ def create_runtime( file_store = get_file_store(config.file_store, config.file_store_path) event_stream = EventStream(session_id, file_store) + # set up the security analyzer + if config.security.security_analyzer: + options.SecurityAnalyzers.get( + config.security.security_analyzer, SecurityAnalyzer + )(event_stream) + # agent class - agent_cls = openhands.agenthub.Agent.get_cls(config.default_agent) + if agent: + agent_cls = type(agent) + else: + agent_cls = openhands.agenthub.Agent.get_cls(config.default_agent) # runtime and tools runtime_cls = get_runtime_cls(config.runtime) @@ -55,10 +74,38 @@ def create_runtime( headless_mode=headless_mode, ) + call_async_from_sync(runtime.connect) + + # clone selected repository if provided + repo_directory = None + github_token = ( + SecretStr(os.environ.get('GITHUB_TOKEN')) if not github_token else github_token + ) + if selected_repository and github_token: + logger.debug(f'Selected repository {selected_repository}.') + repo_directory = runtime.clone_repo( + github_token, + selected_repository, + None, + ) + + # load microagents from selected repository + if agent and agent.prompt_manager and selected_repository and repo_directory: + agent.prompt_manager.set_runtime_info(runtime) + microagents: list[BaseMicroAgent] = runtime.get_microagents_from_selected_repo( + selected_repository + ) + agent.prompt_manager.load_microagents(microagents) + agent.prompt_manager.set_repository_info(selected_repository, repo_directory) + + logger.debug( + f'Runtime initialized with plugins: {[plugin.name for plugin in runtime.plugins]}' + ) + return runtime -def create_agent(runtime: Runtime, config: AppConfig) -> Agent: +def create_agent(config: AppConfig) -> Agent: agent_cls: Type[Agent] = Agent.get_cls(config.default_agent) agent_config = config.get_agent_config(config.default_agent) llm_config = config.get_llm_config_from_agent(config.default_agent) @@ -66,14 +113,6 @@ def create_agent(runtime: Runtime, config: AppConfig) -> Agent: llm=LLM(config=llm_config), config=agent_config, ) - if agent.prompt_manager: - microagents = runtime.get_microagents_from_selected_repo(None) - agent.prompt_manager.load_microagents(microagents) - - if config.security.security_analyzer: - options.SecurityAnalyzers.get( - config.security.security_analyzer, SecurityAnalyzer - )(runtime.event_stream) return agent diff --git a/tests/unit/test_arg_parser.py b/tests/unit/test_arg_parser.py index b71cd5e2c1..56f587470d 100644 --- a/tests/unit/test_arg_parser.py +++ b/tests/unit/test_arg_parser.py @@ -20,6 +20,7 @@ def test_parser_default_values(): assert args.llm_config is None assert args.name == '' assert not args.no_auto_continue + assert args.selected_repo is None def test_parser_custom_values(): @@ -52,6 +53,8 @@ def test_parser_custom_values(): '-n', 'test_session', '--no-auto-continue', + '--selected-repo', + 'owner/repo', ] ) @@ -69,6 +72,7 @@ def test_parser_custom_values(): assert args.name == 'test_session' assert args.no_auto_continue assert args.version + assert args.selected_repo == 'owner/repo' def test_parser_file_overrides_task(): @@ -132,10 +136,18 @@ def test_help_message(capsys): '-n NAME, --name NAME', '--config-file CONFIG_FILE', '--no-auto-continue', + '--selected-repo SELECTED_REPO', ] for element in expected_elements: assert element in help_output, f"Expected '{element}' to be in the help message" option_count = help_output.count(' -') - assert option_count == 18, f'Expected 18 options, found {option_count}' + assert option_count == 19, f'Expected 19 options, found {option_count}' + + +def test_selected_repo_format(): + """Test that the selected-repo argument accepts owner/repo format.""" + parser = get_parser() + args = parser.parse_args(['--selected-repo', 'owner/repo']) + assert args.selected_repo == 'owner/repo'