Compare commits

...

22 Commits

Author SHA1 Message Date
openhands e4d870f9ed Save trajectory to workspace/trajectory.json after agent run 2025-02-23 01:19:22 +00:00
Robert Brennan 1183dd2e0c revert docker impl 2025-02-22 20:09:08 -05:00
openhands fe1052c35f Refactor regression tests to use AppConfig directly 2025-02-23 01:07:27 +00:00
Robert Brennan 9dbd1bf901 Merge branch 'add-regression-tests' of ssh://github.com/All-Hands-AI/OpenHands into add-regression-tests 2025-02-22 19:52:33 -05:00
Robert Brennan 471aba2b57 fix rewrites 2025-02-22 19:51:34 -05:00
openhands 0d6ea9501e Fix workspace cleanup function to handle self parameter 2025-02-23 00:43:07 +00:00
openhands d62adb5c2a Add WORKSPACE_MOUNT_REPLACEMENT environment variable support 2025-02-23 00:34:04 +00:00
openhands f5c44af7f0 Move regression test workspaces into test case directories 2025-02-23 00:32:59 +00:00
openhands 38bfd14e0a Add detailed debug logs for workspace mounting in Docker runtime 2025-02-23 00:25:43 +00:00
openhands bbaa1d9d1e Let environment variables or config file set the LLM model 2025-02-23 00:19:57 +00:00
openhands b7a5c48de5 Fix LLM model name to use gpt-4o 2025-02-23 00:17:12 +00:00
openhands fadba6d779 Ensure Docker containers are cleaned up by setting keep_runtime_alive=False 2025-02-23 00:02:57 +00:00
openhands d819fa5750 Add type hints and fix formatting 2025-02-22 23:59:08 +00:00
openhands 66a60eafdb Update setup_config_from_args to handle all args from regression tests 2025-02-22 23:53:39 +00:00
Robert Brennan dc37ad1433 fix lint 2025-02-22 18:51:01 -05:00
openhands dbabaf2591 Update run.py to use CodeActAgent and add proper workspace mounting 2025-02-22 23:44:16 +00:00
openhands f232ad8e3b Configure workspace mounting for Docker runtime 2025-02-22 23:38:13 +00:00
openhands 5ebdd4ee93 Add NO_CLEANUP environment variable option to preserve test directories 2025-02-22 23:35:41 +00:00
openhands 76fcfba538 Update run.py to use CodeActAgent 2025-02-22 23:27:06 +00:00
openhands 7f6882a3bd Refactor run.py to use OpenHands core directly 2025-02-22 23:19:24 +00:00
openhands 261e618a4b Convert run.sh to run.py for better maintainability and cross-platform support 2025-02-22 23:12:07 +00:00
openhands 8d471aa2c2 Add regression test framework with hello world test case 2025-02-22 22:51:39 +00:00
5 changed files with 192 additions and 0 deletions
+3
View File
@@ -233,3 +233,6 @@ containers/runtime/Dockerfile
containers/runtime/project.tar.gz
containers/runtime/code
**/node_modules/
# regression test workspaces
tests/regression/cases/*/workspace/
@@ -0,0 +1,2 @@
timeout: 120 # 2 minutes
required: true
@@ -0,0 +1 @@
Create a bash script called hello.sh that prints "hello world"
+28
View File
@@ -0,0 +1,28 @@
#!/bin/bash
# Verification script for the hello-world regression case.
# Run from the workspace directory; exits 0 only when hello.sh exists,
# is executable, and prints exactly "hello world".
set -e
set +x

# Print the given message and abort with a failing status.
fail() {
    echo "$1"
    exit 1
}

echo "checking hello world"
pwd
ls -lah

# The script must be present ...
[ -f hello.sh ] || fail "hello.sh does not exist"

# ... and carry the executable bit
[ -x hello.sh ] || fail "hello.sh is not executable"

# Execute it and compare the captured stdout against the expected text
output=$(./hello.sh)
[ "$output" = "hello world" ] || fail "Expected 'hello world' but got: $output"

exit 0
+158
View File
@@ -0,0 +1,158 @@
#!/usr/bin/env python3
import asyncio
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, Optional

import yaml

import openhands.agenthub  # noqa: F401 - import to register agents
from openhands.core.config import AppConfig
from openhands.core.main import auto_continue_response, run_controller
from openhands.events.action import MessageAction
def run_test_case(case_dir: Path) -> bool:
    """Run a single regression test case end to end.

    The case directory must contain ``prompt.txt`` (the task handed to the
    agent) and ``test.sh`` (the verification script). An optional
    ``case.yaml`` may provide:

    * ``required`` - whether a failure of this case fails the run
      (default ``True``);
    * ``git`` / ``commit-ish`` - a repository to clone into the workspace
      and the revision to check out (default ``main``).

    A fresh ``workspace`` directory is created inside the case directory,
    the agent is run with that directory as its workspace, and ``test.sh``
    is then executed from inside it. The workspace is deleted afterwards
    unless the ``NO_CLEANUP`` environment variable is set.

    Args:
        case_dir: Path to the test case directory

    Returns:
        bool: False if a required case failed (workspace preparation, agent
        run, or ``test.sh``); True if the case passed or is not required.

    Raises:
        FileNotFoundError: if ``prompt.txt`` or ``test.sh`` is missing from
            the case directory.
    """
    case_name = case_dir.name
    print(f'Running test case: {case_name}')

    # Load the optional per-case settings once and reuse them below.
    settings: Dict[str, Any] = {}
    case_yaml = case_dir / 'case.yaml'
    if case_yaml.exists():
        with open(case_yaml) as f:
            settings = yaml.safe_load(f) or {}
    required = settings.get('required', True)
    # NOTE(review): case.yaml may also define `timeout`, but nothing in this
    # function enforces it yet - confirm what it should bound (agent run,
    # test.sh, or both) before wiring it in.

    # Start from an empty workspace. The path is made absolute because the
    # process changes directory below, which would invalidate a relative one.
    workspace_dir = (case_dir / 'workspace').resolve()
    if workspace_dir.exists():
        shutil.rmtree(workspace_dir)
    workspace_dir.mkdir(exist_ok=True)
    keep_workspace = bool(os.getenv('NO_CLEANUP'))

    try:
        # Optionally seed the workspace from a git repository.
        repo = settings.get('git')
        if repo:
            commit = str(settings.get('commit-ish', 'main'))
            try:
                # Argument lists (no shell) so values from case.yaml cannot
                # be interpreted as shell syntax; check=True surfaces
                # clone/checkout failures instead of silently continuing.
                subprocess.run(
                    ['git', 'clone', '--', str(repo), str(workspace_dir)],
                    check=True,
                )
                subprocess.run(
                    ['git', 'checkout', commit],
                    cwd=workspace_dir,
                    check=True,
                )
            except (subprocess.CalledProcessError, OSError) as e:
                print(f'Error preparing workspace for test case {case_name}: {e}')
                return not required

        # Copy prompt and test script into the workspace
        shutil.copy2(case_dir / 'prompt.txt', workspace_dir / 'prompt.txt')
        shutil.copy2(case_dir / 'test.sh', workspace_dir / 'test.sh')
        os.chmod(workspace_dir / 'test.sh', 0o755)  # Make test.sh executable

        # Read the prompt
        with open(case_dir / 'prompt.txt') as f:
            task_str = f.read()

        # Set up OpenHands configuration
        app_config = AppConfig()
        app_config.name = case_name
        app_config.agent_cls = 'CodeActAgent'
        app_config.max_budget_per_task = 100
        app_config.max_iterations = 100
        app_config.cli_multiline_input = False
        app_config.config_file = str(
            Path(__file__).parent.parent.parent / 'config.toml'
        )
        app_config.workspace_base = str(workspace_dir)
        app_config.workspace_mount_path = str(workspace_dir)
        app_config.workspace_mount_path_in_sandbox = '/workspace'
        app_config.sandbox.keep_runtime_alive = False
        app_config.save_trajectory_path = str(workspace_dir / 'trajectory.json')

        initial_user_action = MessageAction(content=task_str)

        # Run the agent and the test script from inside the workspace
        original_cwd = os.getcwd()
        os.chdir(workspace_dir)
        try:
            # Run OpenHands
            asyncio.run(
                run_controller(
                    config=app_config,
                    initial_user_action=initial_user_action,
                    fake_user_response_fn=auto_continue_response,
                    headless_mode=True,
                )
            )

            # Run the test script (fixed command, so no injection risk here)
            test_result = os.system('./test.sh')
            if test_result == 0:
                print(f'Test case {case_name} passed')
                return True
            print(f'Test case {case_name} failed')
            # A failing optional case does not fail the whole run.
            return not required
        except Exception as e:
            print(f'Error running test case {case_name}: {e}')
            return not required
        finally:
            os.chdir(original_cwd)
    finally:
        if not keep_workspace and workspace_dir.exists():
            shutil.rmtree(workspace_dir)
def main() -> None:
    """Run all regression tests.

    Every sub-directory of the ``cases`` directory next to this script is
    treated as one test case and passed to ``run_test_case``. All cases are
    run even after a failure. The process exits with status 0 when every
    case returned True and with status 1 otherwise.
    """
    cases_dir = Path(__file__).parent / 'cases'

    # iterdir() yields entries in arbitrary order; sort so that runs are
    # reproducible across machines and filesystems.
    case_dirs = sorted(entry for entry in cases_dir.iterdir() if entry.is_dir())

    # Build the full result list first (no short-circuit) so every case runs.
    results = [run_test_case(case_dir) for case_dir in case_dirs]

    if all(results):
        print('All tests completed successfully')
        sys.exit(0)
    print('Some tests failed')
    sys.exit(1)
# Run the regression suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()