Save trajectory to workspace/trajectory.json after agent run

revert docker impl
Refactor regression tests to use AppConfig directly
2026-04-29 03:00:45 -04:00 · 2025-02-23 01:19:22 +00:00 · 2025-02-22 20:09:08 -05:00 · 2025-02-23 01:07:27 +00:00 · 2025-02-22 19:52:33 -05:00 · 2025-02-22 19:51:34 -05:00
5 changed files with 192 additions and 0 deletions
@@ -233,3 +233,6 @@ containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
 containers/runtime/code
 **/node_modules/
+
+# regression test workspaces
+tests/regression/cases/*/workspace/
@@ -0,0 +1,2 @@
+timeout: 120  # 2 minutes
+required: true
@@ -0,0 +1 @@
+Create a bash script called hello.sh that prints "hello world"
@@ -0,0 +1,28 @@
+#!/bin/bash
+set -e
+set +x
+
+echo "checking hello world"
+pwd
+ls -lah
+
+# Check if hello.sh exists
+if [ ! -f hello.sh ]; then
+    echo "hello.sh does not exist"
+    exit 1
+fi
+
+# Check if it's executable
+if [ ! -x hello.sh ]; then
+    echo "hello.sh is not executable"
+    exit 1
+fi
+
+# Run and check output
+output=$(./hello.sh)
+if [ "$output" != "hello world" ]; then
+    echo "Expected 'hello world' but got: $output"
+    exit 1
+fi
+
+exit 0
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+import asyncio
+import os
+import shutil
+import sys
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import yaml
+
+import openhands.agenthub  # noqa: F401 - import to register agents
+from openhands.core.config import AppConfig
+from openhands.core.main import auto_continue_response, run_controller
+from openhands.events.action import MessageAction
+
+
+def run_test_case(case_dir: Path) -> bool:
+    """Run a single test case.
+
+    Args:
+        case_dir: Path to the test case directory
+
+    Returns:
+        bool: True if test passed, False if failed
+    """
+    case_name = case_dir.name
+    print(f'Running test case: {case_name}')
+
+    # Read case configuration
+    timeout = 120  # Default timeout 2 minutes
+    required = True
+    case_yaml = case_dir / 'case.yaml'
+
+    if case_yaml.exists():
+        with open(case_yaml) as f:
+            config: Optional[Dict[str, Any]] = yaml.safe_load(f)
+            if config:
+                timeout = config.get('timeout', timeout)
+                required = config.get('required', required)
+
+    # Create workspace directory
+    workspace_dir = case_dir / 'workspace'
+    if workspace_dir.exists():
+        # Clean up any existing workspace
+        shutil.rmtree(workspace_dir)
+    workspace_dir.mkdir(exist_ok=True)
+    temp_path = workspace_dir
+    temp_dir = str(workspace_dir)
+    temp_dir_ctx = None
+
+    if not os.getenv('NO_CLEANUP'):
+
+        class WorkspaceCleanup:
+            def __init__(self, workspace_path: Path):
+                self.workspace_path = workspace_path
+
+            def cleanup(self):
+                if self.workspace_path.exists():
+                    shutil.rmtree(self.workspace_path)
+
+        temp_dir_ctx = WorkspaceCleanup(workspace_dir)
+
+    try:
+        # Check if git repo and commit-ish are specified
+        if case_yaml.exists():
+            with open(case_yaml) as f:
+                config = yaml.safe_load(f)
+                if config and 'git' in config:
+                    repo = config['git']
+                    commit = config.get('commit-ish', 'main')
+                    os.system(f'git clone {repo} {temp_dir}')
+                    os.system(f'cd {temp_dir} && git checkout {commit}')
+
+        # Copy prompt and test script
+        shutil.copy2(case_dir / 'prompt.txt', temp_path / 'prompt.txt')
+        shutil.copy2(case_dir / 'test.sh', temp_path / 'test.sh')
+        os.chmod(temp_path / 'test.sh', 0o755)  # Make test.sh executable
+
+        # Read the prompt
+        with open(case_dir / 'prompt.txt') as f:
+            task_str = f.read()
+
+        # Set up OpenHands configuration
+        config = AppConfig()
+        config.name = case_name
+        config.agent_cls = 'CodeActAgent'
+        config.max_budget_per_task = 100
+        config.max_iterations = 100
+        config.cli_multiline_input = False
+        config.config_file = str(Path(__file__).parent.parent.parent / 'config.toml')
+        config.workspace_base = str(temp_path)
+        config.workspace_mount_path = str(temp_path)
+        config.workspace_mount_path_in_sandbox = '/workspace'
+        config.sandbox.keep_runtime_alive = False
+        config.save_trajectory_path = str(temp_path / 'trajectory.json')
+        initial_user_action = MessageAction(content=task_str)
+
+        # Change to temp directory for test execution
+        original_cwd = os.getcwd()
+        os.chdir(temp_dir)
+
+        try:
+            # Run OpenHands
+            asyncio.run(
+                run_controller(
+                    config=config,
+                    initial_user_action=initial_user_action,
+                    fake_user_response_fn=auto_continue_response,
+                    headless_mode=True,
+                )
+            )
+
+            # Run the test script
+            test_result = os.system('./test.sh')
+            if test_result != 0:
+                print(f'Test case {case_name} failed')
+                if required:
+                    return False
+            else:
+                print(f'Test case {case_name} passed')
+                return True
+
+        except Exception as e:
+            print(f'Error running test case {case_name}: {e}')
+            if required:
+                return False
+            return True
+        finally:
+            os.chdir(original_cwd)
+    finally:
+        if temp_dir_ctx is not None:
+            temp_dir_ctx.cleanup()
+
+    return True
+
+
+def main() -> None:
+    """Run all regression tests."""
+    # Find and run all test cases
+    regression_dir = Path(__file__).parent
+    cases_dir = regression_dir / 'cases'
+
+    all_passed = True
+    for case_dir in cases_dir.iterdir():
+        if case_dir.is_dir():
+            if not run_test_case(case_dir):
+                all_passed = False
+
+    if all_passed:
+        print('All tests completed successfully')
+        sys.exit(0)
+    else:
+        print('Some tests failed')
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
Author	SHA1	Message	Date
openhands	e4d870f9ed	Save trajectory to workspace/trajectory.json after agent run	2025-02-23 01:19:22 +00:00
Robert Brennan	1183dd2e0c	revert docker impl	2025-02-22 20:09:08 -05:00
openhands	fe1052c35f	Refactor regression tests to use AppConfig directly	2025-02-23 01:07:27 +00:00
Robert Brennan	9dbd1bf901	Merge branch 'add-regression-tests' of ssh://github.com/All-Hands-AI/OpenHands into add-regression-tests	2025-02-22 19:52:33 -05:00
Robert Brennan	471aba2b57	fix rewrites	2025-02-22 19:51:34 -05:00
openhands	0d6ea9501e	Fix workspace cleanup function to handle self parameter	2025-02-23 00:43:07 +00:00
openhands	d62adb5c2a	Add WORKSPACE_MOUNT_REPLACEMENT environment variable support	2025-02-23 00:34:04 +00:00
openhands	f5c44af7f0	Move regression test workspaces into test case directories	2025-02-23 00:32:59 +00:00
openhands	38bfd14e0a	Add detailed debug logs for workspace mounting in Docker runtime	2025-02-23 00:25:43 +00:00
openhands	bbaa1d9d1e	Let environment variables or config file set the LLM model	2025-02-23 00:19:57 +00:00
openhands	b7a5c48de5	Fix LLM model name to use gpt-4o	2025-02-23 00:17:12 +00:00
openhands	fadba6d779	Ensure Docker containers are cleaned up by setting keep_runtime_alive=False	2025-02-23 00:02:57 +00:00
openhands	d819fa5750	Add type hints and fix formatting	2025-02-22 23:59:08 +00:00
openhands	66a60eafdb	Update setup_config_from_args to handle all args from regression tests	2025-02-22 23:53:39 +00:00
Robert Brennan	dc37ad1433	fix lint	2025-02-22 18:51:01 -05:00
openhands	dbabaf2591	Update run.py to use CodeActAgent and add proper workspace mounting	2025-02-22 23:44:16 +00:00
openhands	f232ad8e3b	Configure workspace mounting for Docker runtime	2025-02-22 23:38:13 +00:00
openhands	5ebdd4ee93	Add NO_CLEANUP environment variable option to preserve test directories	2025-02-22 23:35:41 +00:00
openhands	76fcfba538	Update run.py to use CodeActAgent	2025-02-22 23:27:06 +00:00
openhands	7f6882a3bd	Refactor run.py to use OpenHands core directly	2025-02-22 23:19:24 +00:00
openhands	261e618a4b	Convert run.sh to run.py for better maintainability and cross-platform support	2025-02-22 23:12:07 +00:00
openhands	8d471aa2c2	Add regression test framework with hello world test case	2025-02-22 22:51:39 +00:00
				`@@ -0,0 +1 @@`
				`Create a bash script called hello.sh that prints "hello world"`