Fix WebArena evaluation: bypass recall system for evaluation

- Set the MessageAction source to EventSource.ENVIRONMENT to bypass recall processing.
- This allows the agent to process the task directly without getting stuck in a recall loop.
- The agent now makes LLM calls and executes commands as expected.
- Minimal change: only the evaluation script was modified; no core agent controller changes.
Clean up debug output and force rebuild flags
2026-04-29 03:00:45 -04:00 · 2025-08-23 13:15:05 +00:00 · 2025-08-23 11:41:08 +00:00 · 2025-08-23 11:23:51 +00:00 · 2025-08-23 11:16:55 +00:00 · 2025-08-23 11:13:18 +00:00
5 changed files with 143 additions and 25 deletions
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+import asyncio
+import sys
+import os
+sys.path.insert(0, '/workspace/project/OpenHands')
+
+from evaluation.benchmarks.webarena.run_infer import initialize_runtime, get_config
+from evaluation.utils.shared import EvalMetadata, make_metadata
+from openhands.core.config import load_from_toml
+from openhands.runtime.impl.docker.docker_runtime import DockerRuntime
+from openhands.utils.async_utils import call_async_from_sync
+import pandas as pd
+
+def debug_webarena_goal() -> None:
+    """Fetch and print the WebArena goal text for one hard-coded task.
+
+    Builds evaluation metadata from the ``claude-sonnet-4`` entry of
+    ``config.toml``, derives a config for ``browsergym/webarena.247``,
+    connects a ``DockerRuntime`` and calls ``initialize_runtime``. The type,
+    length and repr of the returned value are printed.
+
+    Errors raised while fetching the goal are printed with a traceback and
+    not re-raised. Errors raised while connecting propagate to the caller,
+    but the runtime is still closed on every exit path.
+    """
+    # Create a minimal instance for testing
+    instance = pd.Series({
+        'instance_id': 'browsergym/webarena.247',
+        'instruction': 'Test instruction'
+    })
+
+    # Load LLM config
+    config_dict = load_from_toml('config.toml')
+    llm_config = config_dict['llm']['claude-sonnet-4']
+
+    # Create metadata
+    metadata = make_metadata(
+        llm_config=llm_config,
+        dataset_name='webarena',
+        agent_class='CodeActAgent',
+        max_iterations=15,
+        eval_note=None,
+        eval_output_dir='evaluation/evaluation_outputs/outputs/webarena/CodeActAgent/debug',
+        details=None,
+    )
+
+    config = get_config(metadata, instance.instance_id)
+
+    # Create runtime
+    # NOTE(review): confirm that DockerRuntime accepts `config.sandbox_config`
+    # as its sole constructor argument; the constructor is not visible here.
+    runtime = DockerRuntime(config.sandbox_config)
+
+    # Everything after construction sits inside try/finally so the runtime is
+    # closed even when connecting fails part-way through.
+    try:
+        call_async_from_sync(runtime.connect)
+
+        print("=== DEBUGGING WEBARENA GOAL ===")
+
+        # Get the goal
+        try:
+            task_str = initialize_runtime(runtime)
+            print(f"Goal text type: {type(task_str)}")
+            print(f"Goal text length: {len(str(task_str))}")
+            print(f"Goal text content: {repr(task_str)}")
+
+            if task_str:
+                print("✅ Goal text is not empty")
+            else:
+                print("❌ Goal text is empty!")
+
+        except Exception as e:
+            # Debug script: report the failure in full instead of aborting.
+            print(f"❌ Error getting goal: {e}")
+            import traceback
+            traceback.print_exc()
+
+    finally:
+        # Cleanup is best-effort, but only ordinary exceptions are absorbed
+        # (KeyboardInterrupt/SystemExit still propagate) and they are reported.
+        try:
+            call_async_from_sync(runtime.close)
+        except Exception as e:
+            print(f"❌ Error closing runtime: {e}")
+
+# Script entry point: run the goal-debugging routine when executed directly.
+if __name__ == "__main__":
+    debug_webarena_goal()
@@ -22,8 +22,8 @@ from evaluation.utils.shared import (
 from openhands.controller.state.state import State
 from openhands.core.config import (
    OpenHandsConfig,
+    get_evaluation_parser,
    get_llm_config_arg,
-    parse_arguments,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -32,6 +32,7 @@ from openhands.events.action import (
    CmdRunAction,
    MessageAction,
 )
+from openhands.events.event import EventSource
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
 from openhands.runtime.browser.browser_env import (
@@ -55,20 +56,23 @@ def get_config(
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'
    sandbox_config.browsergym_eval_env = env_id
+    # Install evaluation dependencies in the runtime container (into Poetry environment)
+    sandbox_config.runtime_extra_deps = '/openhands/micromamba/bin/micromamba run -n openhands poetry run pip install browsergym-webarena==0.13.3'
    sandbox_config.runtime_startup_env_vars = {
-        'BASE_URL': base_url,
+        'WEBARENA_BASE_URL': base_url,
        'OPENAI_API_KEY': openai_api_key,
-        'SHOPPING': f'{base_url}:7770/',
-        'SHOPPING_ADMIN': f'{base_url}:7780/admin',
-        'REDDIT': f'{base_url}:9999',
-        'GITLAB': f'{base_url}:8023',
-        'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
-        'MAP': f'{base_url}:3000',
-        'HOMEPAGE': f'{base_url}:4399',
+        'WA_SHOPPING': f'{base_url}:7770/',
+        'WA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
+        'WA_REDDIT': f'{base_url}:9999',
+        'WA_GITLAB': f'{base_url}:8023',
+        'WA_WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
+        'WA_MAP': f'{base_url}:3000',
+        'WA_HOMEPAGE': f'{base_url}:4399',
    }
    config = get_openhands_config_for_eval(
        metadata=metadata,
        runtime='docker',
+        enable_browser=True,
        sandbox_config=sandbox_config,
    )
    config.set_llm_config(metadata.llm_config)
@@ -79,7 +83,7 @@ def get_config(

 def initialize_runtime(
    runtime: Runtime,
-) -> dict:
+) -> str:
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
@@ -145,13 +149,48 @@ def process_instance(
    call_async_from_sync(runtime.connect)
    task_str = initialize_runtime(runtime)

+    logger.info(f"DEBUG: task_str = {repr(task_str)}")
+    logger.info(f"DEBUG: task_str type = {type(task_str)}")
+    
+    # Use EventSource.ENVIRONMENT to bypass recall processing in evaluation
+    initial_action = MessageAction(content=task_str)
+    initial_action._source = EventSource.ENVIRONMENT  # Bypass recall for evaluation
+    logger.info(f"DEBUG: Created MessageAction: {initial_action}")
+    logger.info(f"DEBUG: MessageAction content: {repr(initial_action.content)}")
+    logger.info(f"DEBUG: MessageAction source: {initial_action.source}")
+
+    # Enable detailed logging for debugging
+    import os
+    os.environ['LOG_ALL_EVENTS'] = '1'
+    
    state: State | None = asyncio.run(
        run_controller(
            config=config,
-            initial_user_action=MessageAction(content=task_str),
+            initial_user_action=initial_action,
            runtime=runtime,
        )
    )
+    
+    logger.info(f"DEBUG: run_controller returned state: {state}")
+    if state:
+        logger.info(f"DEBUG: state.agent_state: {state.agent_state}")
+        logger.info(f"DEBUG: state.history length: {len(state.history)}")
+        logger.info(f"DEBUG: Last 10 events in history:")
+        for i, event in enumerate(state.history[-10:]):
+            logger.info(f"DEBUG:   {i}: {type(event).__name__} - {event}")
+        
+        # Look for RecallActions specifically
+        recall_actions = [e for e in state.history if e.__class__.__name__ == 'RecallAction']
+        logger.info(f"DEBUG: Found {len(recall_actions)} RecallAction(s)")
+        for i, recall in enumerate(recall_actions):
+            logger.info(f"DEBUG: RecallAction {i}: {recall}")
+            
+        # Look for any observations related to RecallActions
+        recall_observations = [e for e in state.history if hasattr(e, 'cause') and any(str(e.cause) == str(r.id) for r in recall_actions)]
+        logger.info(f"DEBUG: Found {len(recall_observations)} RecallAction observation(s)")
+        for i, obs in enumerate(recall_observations):
+            logger.info(f"DEBUG: RecallAction observation {i}: {obs}")
+    
    # ======= Attempt to evaluate the agent's environment impact =======

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
@@ -194,7 +233,8 @@ def process_instance(


 if __name__ == '__main__':
-    args = parse_arguments()
+    parser = get_evaluation_parser()
+    args = parser.parse_args()

    dataset = pd.DataFrame(
        {
@@ -216,7 +256,7 @@ if __name__ == '__main__':

    metadata = make_metadata(
        llm_config,
-        args.dataset_name,
+        'webarena',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
@@ -3,9 +3,6 @@ set -eo pipefail

 source "evaluation/utils/version_control.sh"

-# configure webarena websites and environment
-source evaluation/benchmarks/webarena/scripts/webarena_env.sh
-
 # configure browsing agent
 export USE_NAV="false"
 export USE_CONCISE_ANSWER="true"
@@ -12,7 +12,7 @@ from openhands.core.schema import ActionType
 from openhands.events.action import BrowseInteractiveAction, BrowseURLAction
 from openhands.events.observation import BrowserOutputObservation
 from openhands.runtime.browser.base64 import png_base64_url_to_image
-from openhands.runtime.browser.browser_env import BrowserEnv
+from openhands.runtime.browser.browser_env import BrowserEnv, BROWSER_EVAL_GET_GOAL_ACTION
 from openhands.utils.async_utils import call_sync_from_async


@@ -189,7 +189,9 @@ async def browse(
        )

        # Process the content first using the axtree_object
-        observation.content = get_agent_obs_text(observation)
+        # Skip processing for GET_EVAL_GOAL action to preserve the goal text
+        if action_str != BROWSER_EVAL_GET_GOAL_ACTION:
+            observation.content = get_agent_obs_text(observation)

        # If return_axtree is False, remove the axtree_object to save space
        if not action.return_axtree:
@@ -214,10 +216,12 @@ async def browse(
        )

        # Process the content using get_agent_obs_text regardless of return_axtree value
-        try:
-            observation.content = get_agent_obs_text(observation)
-        except Exception:
-            # If get_agent_obs_text fails, keep the original error message
-            pass
+        # Skip processing for GET_EVAL_GOAL action to preserve the goal text
+        if action_str != BROWSER_EVAL_GET_GOAL_ACTION:
+            try:
+                observation.content = get_agent_obs_text(observation)
+            except Exception:
+                # If get_agent_obs_text fails, keep the original error message
+                pass

        return observation
@@ -177,7 +177,7 @@ def build_runtime_image_in_folder(
    enable_browser: bool = True,
 ) -> str:
    runtime_image_repo, _ = get_runtime_image_repo_and_tag(base_image)
-    lock_tag = f'oh_v{oh_version}_{get_hash_for_lock_files(base_image, enable_browser)}'
+    lock_tag = f'oh_v{oh_version}_{get_hash_for_lock_files(base_image, enable_browser, extra_deps)}'
    versioned_tag = (
        # truncate the base image to 96 characters to fit in the tag max length (128 characters)
        f'oh_v{oh_version}_{get_tag_for_versioned_image(base_image)}'
@@ -317,13 +317,18 @@ def truncate_hash(hash: str) -> str:
    return ''.join(result)


-def get_hash_for_lock_files(base_image: str, enable_browser: bool = True) -> str:
+def get_hash_for_lock_files(
+    base_image: str, enable_browser: bool = True, extra_deps: str | None = None
+) -> str:
    openhands_source_dir = Path(openhands.__file__).parent
    md5 = hashlib.md5()
    md5.update(base_image.encode())
    # Only include enable_browser in hash when it's False for backward compatibility
    if not enable_browser:
        md5.update(str(enable_browser).encode())
+    # Include extra dependencies in hash to ensure different deps result in different images
+    if extra_deps:
+        md5.update(extra_deps.encode())
    for file in ['pyproject.toml', 'poetry.lock']:
        src = Path(openhands_source_dir, file)
        if not src.exists():
Author	SHA1	Message	Date
openhands	3c2dce4ef3	Fix WebArena evaluation: bypass recall system for evaluation - Set MessageAction source to EventSource.ENVIRONMENT to bypass recall processing - This allows the agent to process the task directly without getting stuck in recall loop - Agent now makes LLM calls and executes commands as expected - Minimal change: only modified evaluation script, no core agent controller changes	2025-08-23 13:15:05 +00:00
openhands	2b02dec226	Clean up debug output and force rebuild flags - Remove debug print statement from runtime_build.py - Remove force_rebuild_runtime flag from WebArena evaluation - Simplify runtime_extra_deps command - Copy cleaned up files to cached location	2025-08-23 11:41:08 +00:00
openhands	44e116a5aa	Add debug output for oh_version	2025-08-23 11:23:51 +00:00
openhands	3c73316907	Update runtime_extra_deps to v4 to force new hash and rebuild	2025-08-23 11:16:55 +00:00
openhands	6ee8837ede	Fix WebArena GET_EVAL_GOAL action by preserving goal text in browser utils - Modified browse() function to skip get_agent_obs_text() processing for GET_EVAL_GOAL action - Fixed both main processing path and exception handler to preserve original goal text - Added force_rebuild_runtime flag to ensure updated browser utils are used in runtime container - This resolves the AWAITING_USER_INPUT issue where goal text was being overwritten with processed browser observation	2025-08-23 11:13:18 +00:00
openhands	1c6878d4a3	Fix WebArena evaluation by installing browsergym-webarena in runtime - Add runtime_extra_deps to install browsergym-webarena==0.13.3 in Poetry environment - Enhance runtime build system to include extra_deps in hash calculation - This ensures different extra dependencies result in different image names - Prevents incorrect caching of images without required dependencies - Removes need for force_rebuild_runtime flag Co-authored-by: openhands <openhands@all-hands.dev>	2025-08-23 10:14:34 +00:00
Graham Neubig	07488d36e5	Update webarena scripts	2025-08-22 21:24:56 -04:00