Compare commits

...

7 Commits

Author SHA1 Message Date
openhands 3c2dce4ef3 Fix WebArena evaluation: bypass recall system for evaluation
- Set MessageAction source to EventSource.ENVIRONMENT to bypass recall processing
- This allows the agent to process the task directly without getting stuck in recall loop
- Agent now makes LLM calls and executes commands as expected
- Minimal change: only modified evaluation script, no core agent controller changes
2025-08-23 13:15:05 +00:00
openhands 2b02dec226 Clean up debug output and force rebuild flags
- Remove debug print statement from runtime_build.py
- Remove force_rebuild_runtime flag from WebArena evaluation
- Simplify runtime_extra_deps command
- Copy cleaned up files to cached location
2025-08-23 11:41:08 +00:00
openhands 44e116a5aa Add debug output for oh_version 2025-08-23 11:23:51 +00:00
openhands 3c73316907 Update runtime_extra_deps to v4 to force new hash and rebuild 2025-08-23 11:16:55 +00:00
openhands 6ee8837ede Fix WebArena GET_EVAL_GOAL action by preserving goal text in browser utils
- Modified browse() function to skip get_agent_obs_text() processing for GET_EVAL_GOAL action
- Fixed both main processing path and exception handler to preserve original goal text
- Added force_rebuild_runtime flag to ensure updated browser utils are used in runtime container
- This resolves the AWAITING_USER_INPUT issue where goal text was being overwritten with processed browser observation
2025-08-23 11:13:18 +00:00
openhands 1c6878d4a3 Fix WebArena evaluation by installing browsergym-webarena in runtime
- Add runtime_extra_deps to install browsergym-webarena==0.13.3 in Poetry environment
- Enhance runtime build system to include extra_deps in hash calculation
- This ensures different extra dependencies result in different image names
- Prevents incorrect caching of images without required dependencies
- Removes need for force_rebuild_runtime flag

Co-authored-by: openhands <openhands@all-hands.dev>
2025-08-23 10:14:34 +00:00
Graham Neubig 07488d36e5 Update webarena scripts 2025-08-22 21:24:56 -04:00
5 changed files with 143 additions and 25 deletions
+72
View File
@@ -0,0 +1,72 @@
#!/usr/bin/env python3
import asyncio
import sys
import os
sys.path.insert(0, '/workspace/project/OpenHands')
from evaluation.benchmarks.webarena.run_infer import initialize_runtime, get_config
from evaluation.utils.shared import EvalMetadata, make_metadata
from openhands.core.config import load_from_toml
from openhands.runtime.impl.docker.docker_runtime import DockerRuntime
from openhands.utils.async_utils import call_async_from_sync
import pandas as pd
def debug_webarena_goal():
"""Debug what the WebArena goal looks like"""
# Create a minimal instance for testing
instance = pd.Series({
'instance_id': 'browsergym/webarena.247',
'instruction': 'Test instruction'
})
# Load LLM config
config_dict = load_from_toml('config.toml')
llm_config = config_dict['llm']['claude-sonnet-4']
# Create metadata
metadata = make_metadata(
llm_config=llm_config,
dataset_name='webarena',
agent_class='CodeActAgent',
max_iterations=15,
eval_note=None,
eval_output_dir='evaluation/evaluation_outputs/outputs/webarena/CodeActAgent/debug',
details=None,
)
config = get_config(metadata, instance.instance_id)
# Create runtime
runtime = DockerRuntime(config.sandbox_config)
call_async_from_sync(runtime.connect)
print("=== DEBUGGING WEBARENA GOAL ===")
# Get the goal
try:
task_str = initialize_runtime(runtime)
print(f"Goal text type: {type(task_str)}")
print(f"Goal text length: {len(str(task_str))}")
print(f"Goal text content: {repr(task_str)}")
if task_str:
print("✅ Goal text is not empty")
else:
print("❌ Goal text is empty!")
except Exception as e:
print(f"❌ Error getting goal: {e}")
import traceback
traceback.print_exc()
finally:
# Cleanup
try:
call_async_from_sync(runtime.close)
except:
pass
if __name__ == "__main__":
debug_webarena_goal()
+53 -13
View File
@@ -22,8 +22,8 @@ from evaluation.utils.shared import (
from openhands.controller.state.state import State
from openhands.core.config import (
OpenHandsConfig,
get_evaluation_parser,
get_llm_config_arg,
parse_arguments,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
@@ -32,6 +32,7 @@ from openhands.events.action import (
CmdRunAction,
MessageAction,
)
from openhands.events.event import EventSource
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.runtime.browser.browser_env import (
@@ -55,20 +56,23 @@ def get_config(
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'
sandbox_config.browsergym_eval_env = env_id
# Install evaluation dependencies in the runtime container (into Poetry environment)
sandbox_config.runtime_extra_deps = '/openhands/micromamba/bin/micromamba run -n openhands poetry run pip install browsergym-webarena==0.13.3'
sandbox_config.runtime_startup_env_vars = {
'BASE_URL': base_url,
'WEBARENA_BASE_URL': base_url,
'OPENAI_API_KEY': openai_api_key,
'SHOPPING': f'{base_url}:7770/',
'SHOPPING_ADMIN': f'{base_url}:7780/admin',
'REDDIT': f'{base_url}:9999',
'GITLAB': f'{base_url}:8023',
'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
'MAP': f'{base_url}:3000',
'HOMEPAGE': f'{base_url}:4399',
'WA_SHOPPING': f'{base_url}:7770/',
'WA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
'WA_REDDIT': f'{base_url}:9999',
'WA_GITLAB': f'{base_url}:8023',
'WA_WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
'WA_MAP': f'{base_url}:3000',
'WA_HOMEPAGE': f'{base_url}:4399',
}
config = get_openhands_config_for_eval(
metadata=metadata,
runtime='docker',
enable_browser=True,
sandbox_config=sandbox_config,
)
config.set_llm_config(metadata.llm_config)
@@ -79,7 +83,7 @@ def get_config(
def initialize_runtime(
runtime: Runtime,
) -> dict:
) -> str:
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
@@ -145,13 +149,48 @@ def process_instance(
call_async_from_sync(runtime.connect)
task_str = initialize_runtime(runtime)
logger.info(f"DEBUG: task_str = {repr(task_str)}")
logger.info(f"DEBUG: task_str type = {type(task_str)}")
# Use EventSource.ENVIRONMENT to bypass recall processing in evaluation
initial_action = MessageAction(content=task_str)
initial_action._source = EventSource.ENVIRONMENT # Bypass recall for evaluation
logger.info(f"DEBUG: Created MessageAction: {initial_action}")
logger.info(f"DEBUG: MessageAction content: {repr(initial_action.content)}")
logger.info(f"DEBUG: MessageAction source: {initial_action.source}")
# Enable detailed logging for debugging
import os
os.environ['LOG_ALL_EVENTS'] = '1'
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=task_str),
initial_user_action=initial_action,
runtime=runtime,
)
)
logger.info(f"DEBUG: run_controller returned state: {state}")
if state:
logger.info(f"DEBUG: state.agent_state: {state.agent_state}")
logger.info(f"DEBUG: state.history length: {len(state.history)}")
logger.info(f"DEBUG: Last 10 events in history:")
for i, event in enumerate(state.history[-10:]):
logger.info(f"DEBUG: {i}: {type(event).__name__} - {event}")
# Look for RecallActions specifically
recall_actions = [e for e in state.history if e.__class__.__name__ == 'RecallAction']
logger.info(f"DEBUG: Found {len(recall_actions)} RecallAction(s)")
for i, recall in enumerate(recall_actions):
logger.info(f"DEBUG: RecallAction {i}: {recall}")
# Look for any observations related to RecallActions
recall_observations = [e for e in state.history if hasattr(e, 'cause') and any(str(e.cause) == str(r.id) for r in recall_actions)]
logger.info(f"DEBUG: Found {len(recall_observations)} RecallAction observation(s)")
for i, obs in enumerate(recall_observations):
logger.info(f"DEBUG: RecallAction observation {i}: {obs}")
# ======= Attempt to evaluate the agent's environment impact =======
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
@@ -194,7 +233,8 @@ def process_instance(
if __name__ == '__main__':
args = parse_arguments()
parser = get_evaluation_parser()
args = parser.parse_args()
dataset = pd.DataFrame(
{
@@ -216,7 +256,7 @@ if __name__ == '__main__':
metadata = make_metadata(
llm_config,
args.dataset_name,
'webarena',
args.agent_cls,
args.max_iterations,
args.eval_note,
@@ -3,9 +3,6 @@ set -eo pipefail
source "evaluation/utils/version_control.sh"
# configure webarena websites and environment
source evaluation/benchmarks/webarena/scripts/webarena_env.sh
# configure browsing agent
export USE_NAV="false"
export USE_CONCISE_ANSWER="true"
+11 -7
View File
@@ -12,7 +12,7 @@ from openhands.core.schema import ActionType
from openhands.events.action import BrowseInteractiveAction, BrowseURLAction
from openhands.events.observation import BrowserOutputObservation
from openhands.runtime.browser.base64 import png_base64_url_to_image
from openhands.runtime.browser.browser_env import BrowserEnv
from openhands.runtime.browser.browser_env import BrowserEnv, BROWSER_EVAL_GET_GOAL_ACTION
from openhands.utils.async_utils import call_sync_from_async
@@ -189,7 +189,9 @@ async def browse(
)
# Process the content first using the axtree_object
observation.content = get_agent_obs_text(observation)
# Skip processing for GET_EVAL_GOAL action to preserve the goal text
if action_str != BROWSER_EVAL_GET_GOAL_ACTION:
observation.content = get_agent_obs_text(observation)
# If return_axtree is False, remove the axtree_object to save space
if not action.return_axtree:
@@ -214,10 +216,12 @@ async def browse(
)
# Process the content using get_agent_obs_text regardless of return_axtree value
try:
observation.content = get_agent_obs_text(observation)
except Exception:
# If get_agent_obs_text fails, keep the original error message
pass
# Skip processing for GET_EVAL_GOAL action to preserve the goal text
if action_str != BROWSER_EVAL_GET_GOAL_ACTION:
try:
observation.content = get_agent_obs_text(observation)
except Exception:
# If get_agent_obs_text fails, keep the original error message
pass
return observation
+7 -2
View File
@@ -177,7 +177,7 @@ def build_runtime_image_in_folder(
enable_browser: bool = True,
) -> str:
runtime_image_repo, _ = get_runtime_image_repo_and_tag(base_image)
lock_tag = f'oh_v{oh_version}_{get_hash_for_lock_files(base_image, enable_browser)}'
lock_tag = f'oh_v{oh_version}_{get_hash_for_lock_files(base_image, enable_browser, extra_deps)}'
versioned_tag = (
# truncate the base image to 96 characters to fit in the tag max length (128 characters)
f'oh_v{oh_version}_{get_tag_for_versioned_image(base_image)}'
@@ -317,13 +317,18 @@ def truncate_hash(hash: str) -> str:
return ''.join(result)
def get_hash_for_lock_files(base_image: str, enable_browser: bool = True) -> str:
def get_hash_for_lock_files(
base_image: str, enable_browser: bool = True, extra_deps: str | None = None
) -> str:
openhands_source_dir = Path(openhands.__file__).parent
md5 = hashlib.md5()
md5.update(base_image.encode())
# Only include enable_browser in hash when it's False for backward compatibility
if not enable_browser:
md5.update(str(enable_browser).encode())
# Include extra dependencies in hash to ensure different deps result in different images
if extra_deps:
md5.update(extra_deps.encode())
for file in ['pyproject.toml', 'poetry.lock']:
src = Path(openhands_source_dir, file)
if not src.exists():