mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3c2dce4ef3 | |||
| 2b02dec226 | |||
| 44e116a5aa | |||
| 3c73316907 | |||
| 6ee8837ede | |||
| 1c6878d4a3 | |||
| 07488d36e5 |
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, '/workspace/project/OpenHands')
|
||||
|
||||
from evaluation.benchmarks.webarena.run_infer import initialize_runtime, get_config
|
||||
from evaluation.utils.shared import EvalMetadata, make_metadata
|
||||
from openhands.core.config import load_from_toml
|
||||
from openhands.runtime.impl.docker.docker_runtime import DockerRuntime
|
||||
from openhands.utils.async_utils import call_async_from_sync
|
||||
import pandas as pd
|
||||
|
||||
def debug_webarena_goal():
    """Print diagnostics about the goal text returned for one WebArena task.

    Builds an evaluation config for a single hard-coded task
    (``browsergym/webarena.247``), creates a ``DockerRuntime`` from it, calls
    ``initialize_runtime`` and prints the type, length and ``repr`` of the
    returned value, followed by whether that value is truthy.

    Errors raised while fetching or printing the goal are reported on stdout
    together with a traceback instead of being propagated. Errors raised while
    connecting to the runtime still propagate, but the runtime is closed first.
    """
    import traceback

    # Minimal stand-in for a dataset row; only ``instance_id`` is read below.
    instance = pd.Series(
        {
            'instance_id': 'browsergym/webarena.247',
            'instruction': 'Test instruction',
        }
    )

    # Load LLM config.
    # NOTE(review): assumes load_from_toml returns a nested mapping keyed by
    # section and then by LLM name -- confirm against its actual signature.
    config_dict = load_from_toml('config.toml')
    llm_config = config_dict['llm']['claude-sonnet-4']

    # Create metadata
    metadata = make_metadata(
        llm_config=llm_config,
        dataset_name='webarena',
        agent_class='CodeActAgent',
        max_iterations=15,
        eval_note=None,
        eval_output_dir='evaluation/evaluation_outputs/outputs/webarena/CodeActAgent/debug',
        details=None,
    )

    config = get_config(metadata, instance.instance_id)

    # Create runtime
    runtime = DockerRuntime(config.sandbox_config)

    try:
        # Connecting happens inside the try/finally so that a failed connect
        # still goes through the cleanup path below.
        call_async_from_sync(runtime.connect)

        print("=== DEBUGGING WEBARENA GOAL ===")

        # Get the goal
        try:
            task_str = initialize_runtime(runtime)
            print(f"Goal text type: {type(task_str)}")
            print(f"Goal text length: {len(str(task_str))}")
            print(f"Goal text content: {repr(task_str)}")

            if task_str:
                print("✅ Goal text is not empty")
            else:
                print("❌ Goal text is empty!")
        except Exception as e:
            # This is a diagnostic script: report the failure and keep going
            # to cleanup rather than crashing.
            print(f"❌ Error getting goal: {e}")
            traceback.print_exc()
    finally:
        # Cleanup is best-effort: a failing close must not mask the outcome
        # above, but it is reported instead of being silently discarded, and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            call_async_from_sync(runtime.close)
        except Exception as cleanup_error:
            print(f"Warning: failed to close runtime: {cleanup_error}")
|
||||
|
||||
# Run the goal diagnostics only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    debug_webarena_goal()
|
||||
@@ -22,8 +22,8 @@ from evaluation.utils.shared import (
|
||||
from openhands.controller.state.state import State
|
||||
from openhands.core.config import (
|
||||
OpenHandsConfig,
|
||||
get_evaluation_parser,
|
||||
get_llm_config_arg,
|
||||
parse_arguments,
|
||||
)
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.main import create_runtime, run_controller
|
||||
@@ -32,6 +32,7 @@ from openhands.events.action import (
|
||||
CmdRunAction,
|
||||
MessageAction,
|
||||
)
|
||||
from openhands.events.event import EventSource
|
||||
from openhands.events.observation import CmdOutputObservation
|
||||
from openhands.runtime.base import Runtime
|
||||
from openhands.runtime.browser.browser_env import (
|
||||
@@ -55,20 +56,23 @@ def get_config(
|
||||
sandbox_config = get_default_sandbox_config_for_eval()
|
||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||
sandbox_config.browsergym_eval_env = env_id
|
||||
# Install evaluation dependencies in the runtime container (into Poetry environment)
|
||||
sandbox_config.runtime_extra_deps = '/openhands/micromamba/bin/micromamba run -n openhands poetry run pip install browsergym-webarena==0.13.3'
|
||||
sandbox_config.runtime_startup_env_vars = {
|
||||
'BASE_URL': base_url,
|
||||
'WEBARENA_BASE_URL': base_url,
|
||||
'OPENAI_API_KEY': openai_api_key,
|
||||
'SHOPPING': f'{base_url}:7770/',
|
||||
'SHOPPING_ADMIN': f'{base_url}:7780/admin',
|
||||
'REDDIT': f'{base_url}:9999',
|
||||
'GITLAB': f'{base_url}:8023',
|
||||
'WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
|
||||
'MAP': f'{base_url}:3000',
|
||||
'HOMEPAGE': f'{base_url}:4399',
|
||||
'WA_SHOPPING': f'{base_url}:7770/',
|
||||
'WA_SHOPPING_ADMIN': f'{base_url}:7780/admin',
|
||||
'WA_REDDIT': f'{base_url}:9999',
|
||||
'WA_GITLAB': f'{base_url}:8023',
|
||||
'WA_WIKIPEDIA': f'{base_url}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing',
|
||||
'WA_MAP': f'{base_url}:3000',
|
||||
'WA_HOMEPAGE': f'{base_url}:4399',
|
||||
}
|
||||
config = get_openhands_config_for_eval(
|
||||
metadata=metadata,
|
||||
runtime='docker',
|
||||
enable_browser=True,
|
||||
sandbox_config=sandbox_config,
|
||||
)
|
||||
config.set_llm_config(metadata.llm_config)
|
||||
@@ -79,7 +83,7 @@ def get_config(
|
||||
|
||||
def initialize_runtime(
|
||||
runtime: Runtime,
|
||||
) -> dict:
|
||||
) -> str:
|
||||
"""Initialize the runtime for the agent.
|
||||
|
||||
This function is called before the runtime is used to run the agent.
|
||||
@@ -145,13 +149,48 @@ def process_instance(
|
||||
call_async_from_sync(runtime.connect)
|
||||
task_str = initialize_runtime(runtime)
|
||||
|
||||
logger.info(f"DEBUG: task_str = {repr(task_str)}")
|
||||
logger.info(f"DEBUG: task_str type = {type(task_str)}")
|
||||
|
||||
# Use EventSource.ENVIRONMENT to bypass recall processing in evaluation
|
||||
initial_action = MessageAction(content=task_str)
|
||||
initial_action._source = EventSource.ENVIRONMENT # Bypass recall for evaluation
|
||||
logger.info(f"DEBUG: Created MessageAction: {initial_action}")
|
||||
logger.info(f"DEBUG: MessageAction content: {repr(initial_action.content)}")
|
||||
logger.info(f"DEBUG: MessageAction source: {initial_action.source}")
|
||||
|
||||
# Enable detailed logging for debugging
|
||||
import os
|
||||
os.environ['LOG_ALL_EVENTS'] = '1'
|
||||
|
||||
state: State | None = asyncio.run(
|
||||
run_controller(
|
||||
config=config,
|
||||
initial_user_action=MessageAction(content=task_str),
|
||||
initial_user_action=initial_action,
|
||||
runtime=runtime,
|
||||
)
|
||||
)
|
||||
|
||||
logger.info(f"DEBUG: run_controller returned state: {state}")
|
||||
if state:
|
||||
logger.info(f"DEBUG: state.agent_state: {state.agent_state}")
|
||||
logger.info(f"DEBUG: state.history length: {len(state.history)}")
|
||||
logger.info(f"DEBUG: Last 10 events in history:")
|
||||
for i, event in enumerate(state.history[-10:]):
|
||||
logger.info(f"DEBUG: {i}: {type(event).__name__} - {event}")
|
||||
|
||||
# Look for RecallActions specifically
|
||||
recall_actions = [e for e in state.history if e.__class__.__name__ == 'RecallAction']
|
||||
logger.info(f"DEBUG: Found {len(recall_actions)} RecallAction(s)")
|
||||
for i, recall in enumerate(recall_actions):
|
||||
logger.info(f"DEBUG: RecallAction {i}: {recall}")
|
||||
|
||||
# Look for any observations related to RecallActions
|
||||
recall_observations = [e for e in state.history if hasattr(e, 'cause') and any(str(e.cause) == str(r.id) for r in recall_actions)]
|
||||
logger.info(f"DEBUG: Found {len(recall_observations)} RecallAction observation(s)")
|
||||
for i, obs in enumerate(recall_observations):
|
||||
logger.info(f"DEBUG: RecallAction observation {i}: {obs}")
|
||||
|
||||
# ======= Attempt to evaluate the agent's environment impact =======
|
||||
|
||||
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
||||
@@ -194,7 +233,8 @@ def process_instance(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
parser = get_evaluation_parser()
|
||||
args = parser.parse_args()
|
||||
|
||||
dataset = pd.DataFrame(
|
||||
{
|
||||
@@ -216,7 +256,7 @@ if __name__ == '__main__':
|
||||
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
'webarena',
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
|
||||
@@ -3,9 +3,6 @@ set -eo pipefail
|
||||
|
||||
source "evaluation/utils/version_control.sh"
|
||||
|
||||
# configure webarena websites and environment
|
||||
source evaluation/benchmarks/webarena/scripts/webarena_env.sh
|
||||
|
||||
# configure browsing agent
|
||||
export USE_NAV="false"
|
||||
export USE_CONCISE_ANSWER="true"
|
||||
|
||||
@@ -12,7 +12,7 @@ from openhands.core.schema import ActionType
|
||||
from openhands.events.action import BrowseInteractiveAction, BrowseURLAction
|
||||
from openhands.events.observation import BrowserOutputObservation
|
||||
from openhands.runtime.browser.base64 import png_base64_url_to_image
|
||||
from openhands.runtime.browser.browser_env import BrowserEnv
|
||||
from openhands.runtime.browser.browser_env import BrowserEnv, BROWSER_EVAL_GET_GOAL_ACTION
|
||||
from openhands.utils.async_utils import call_sync_from_async
|
||||
|
||||
|
||||
@@ -189,7 +189,9 @@ async def browse(
|
||||
)
|
||||
|
||||
# Process the content first using the axtree_object
|
||||
observation.content = get_agent_obs_text(observation)
|
||||
# Skip processing for GET_EVAL_GOAL action to preserve the goal text
|
||||
if action_str != BROWSER_EVAL_GET_GOAL_ACTION:
|
||||
observation.content = get_agent_obs_text(observation)
|
||||
|
||||
# If return_axtree is False, remove the axtree_object to save space
|
||||
if not action.return_axtree:
|
||||
@@ -214,10 +216,12 @@ async def browse(
|
||||
)
|
||||
|
||||
# Process the content using get_agent_obs_text regardless of return_axtree value
|
||||
try:
|
||||
observation.content = get_agent_obs_text(observation)
|
||||
except Exception:
|
||||
# If get_agent_obs_text fails, keep the original error message
|
||||
pass
|
||||
# Skip processing for GET_EVAL_GOAL action to preserve the goal text
|
||||
if action_str != BROWSER_EVAL_GET_GOAL_ACTION:
|
||||
try:
|
||||
observation.content = get_agent_obs_text(observation)
|
||||
except Exception:
|
||||
# If get_agent_obs_text fails, keep the original error message
|
||||
pass
|
||||
|
||||
return observation
|
||||
|
||||
@@ -177,7 +177,7 @@ def build_runtime_image_in_folder(
|
||||
enable_browser: bool = True,
|
||||
) -> str:
|
||||
runtime_image_repo, _ = get_runtime_image_repo_and_tag(base_image)
|
||||
lock_tag = f'oh_v{oh_version}_{get_hash_for_lock_files(base_image, enable_browser)}'
|
||||
lock_tag = f'oh_v{oh_version}_{get_hash_for_lock_files(base_image, enable_browser, extra_deps)}'
|
||||
versioned_tag = (
|
||||
# truncate the base image to 96 characters to fit in the tag max length (128 characters)
|
||||
f'oh_v{oh_version}_{get_tag_for_versioned_image(base_image)}'
|
||||
@@ -317,13 +317,18 @@ def truncate_hash(hash: str) -> str:
|
||||
return ''.join(result)
|
||||
|
||||
|
||||
def get_hash_for_lock_files(base_image: str, enable_browser: bool = True) -> str:
|
||||
def get_hash_for_lock_files(
|
||||
base_image: str, enable_browser: bool = True, extra_deps: str | None = None
|
||||
) -> str:
|
||||
openhands_source_dir = Path(openhands.__file__).parent
|
||||
md5 = hashlib.md5()
|
||||
md5.update(base_image.encode())
|
||||
# Only include enable_browser in hash when it's False for backward compatibility
|
||||
if not enable_browser:
|
||||
md5.update(str(enable_browser).encode())
|
||||
# Include extra dependencies in hash to ensure different deps result in different images
|
||||
if extra_deps:
|
||||
md5.update(extra_deps.encode())
|
||||
for file in ['pyproject.toml', 'poetry.lock']:
|
||||
src = Path(openhands_source_dir, file)
|
||||
if not src.exists():
|
||||
|
||||
Reference in New Issue
Block a user