mirror of
https://github.com/All-Hands-AI/OpenHands.git
synced 2026-04-29 03:00:45 -04:00
116 lines
5.1 KiB
Python
116 lines
5.1 KiB
Python
import base64
|
|
import datetime
|
|
import os
|
|
from pathlib import Path
|
|
|
|
from PIL import Image
|
|
|
|
from openhands.core.exceptions import BrowserUnavailableException
|
|
from openhands.core.schema import ActionType
|
|
from openhands.events.action import BrowseInteractiveAction, BrowseURLAction
|
|
from openhands.events.observation import BrowserOutputObservation
|
|
from openhands.runtime.browser.base64 import png_base64_url_to_image
|
|
from openhands.runtime.browser.browser_env import BrowserEnv
|
|
from openhands.utils.async_utils import call_sync_from_async
|
|
|
|
|
|
def _save_screenshot(screenshot: str, workspace_dir: str) -> str:
    """Write a base64-encoded PNG screenshot to disk and return its path.

    The file is stored as
    ``<workspace_dir>/.browser_screenshots/screenshot_<timestamp>.png`` where
    the timestamp has microsecond resolution.

    Args:
        screenshot: Base64 screenshot payload. If it contains a comma, the
            text before the first comma is treated as a data-URL header and
            dropped before decoding.
        workspace_dir: Directory under which the ``.browser_screenshots``
            folder is created.

    Returns:
        The path of the saved PNG file, as a string.

    Raises:
        Exception: Whatever the fallback save raises if both the direct
            write and the PIL-based fallback fail.
    """
    screenshots_dir = Path(workspace_dir) / '.browser_screenshots'
    # parents=True: a workspace directory that has not been created yet must
    # not fail the whole browse step with FileNotFoundError.
    screenshots_dir.mkdir(parents=True, exist_ok=True)

    # Generate a filename based on timestamp
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    screenshot_path = str(screenshots_dir / f'screenshot_{timestamp}.png')

    # Extract the base64 data (drop a leading "data:...;base64," style header)
    base64_data = screenshot
    if ',' in base64_data:
        base64_data = base64_data.split(',')[1]

    try:
        # Write the decoded bytes straight to disk instead of round-tripping
        # through a PIL image, so the browser's PNG data is stored unchanged.
        image_data = base64.b64decode(base64_data)
        with open(screenshot_path, 'wb') as f:
            f.write(image_data)

        # Sanity-check the written file; the context manager guarantees the
        # underlying file handle is released even when verify() raises.
        with Image.open(screenshot_path) as saved_image:
            saved_image.verify()
    except Exception:
        # If direct saving fails, fall back to decoding via PIL and re-encoding
        image = png_base64_url_to_image(screenshot)
        image.save(screenshot_path, format='PNG', optimize=True)

    return screenshot_path


async def browse(
    action: BrowseURLAction | BrowseInteractiveAction,
    browser: BrowserEnv | None,
    workspace_dir: str | None = None,
) -> BrowserOutputObservation:
    """Execute a browsing action and wrap the result in an observation.

    A ``BrowseURLAction`` is translated into a ``goto("<url>")`` command; a
    ``BrowseInteractiveAction`` supplies its ``browser_actions`` string
    verbatim. The command is run through ``browser.step`` off the event loop
    via ``call_sync_from_async``.

    Args:
        action: The browsing action to perform.
        browser: The browser environment to run the action in.
        workspace_dir: When given and the step returned a screenshot, the
            screenshot is also saved under
            ``<workspace_dir>/.browser_screenshots`` and its path is reported
            in the observation.

    Returns:
        A ``BrowserOutputObservation`` built from the step result. If the
        step, the screenshot save, or building the observation raises, an
        observation with ``error=True`` and the exception text as content is
        returned instead.

    Raises:
        BrowserUnavailableException: If ``browser`` is ``None``.
        ValueError: If ``action`` is neither supported action type.
    """
    if browser is None:
        raise BrowserUnavailableException()

    if isinstance(action, BrowseURLAction):
        # legacy BrowseURLAction
        asked_url = action.url
        if not asked_url.startswith('http'):
            # Non-http targets are prefixed with the absolute current
            # working directory (plain string concatenation, kept as-is).
            asked_url = os.path.abspath(os.curdir) + action.url
        action_str = f'goto("{asked_url}")'

    elif isinstance(action, BrowseInteractiveAction):
        # new BrowseInteractiveAction, supports full featured BrowserGym actions
        # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py
        action_str = action.browser_actions
    else:
        raise ValueError(f'Invalid action type: {action.action}')

    try:
        # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
        obs = await call_sync_from_async(browser.step, action_str)

        # Save screenshot if workspace_dir is provided
        screenshot_path = None
        screenshot = obs.get('screenshot')
        if workspace_dir is not None and screenshot:
            screenshot_path = _save_screenshot(screenshot, workspace_dir)

        last_action_error = obs.get('last_action_error', '')
        return BrowserOutputObservation(
            content=obs['text_content'],  # text content of the page
            url=obs.get('url', ''),  # URL of the page
            screenshot=obs.get('screenshot', None),  # base64-encoded screenshot, png
            screenshot_path=screenshot_path,  # path to saved screenshot file
            set_of_marks=obs.get(
                'set_of_marks', None
            ),  # base64-encoded Set-of-Marks annotated screenshot, png,
            goal_image_urls=obs.get('image_content', []),
            open_pages_urls=obs.get('open_pages_urls', []),  # list of open pages
            active_page_index=obs.get(
                'active_page_index', -1
            ),  # index of the active page
            axtree_object=obs.get('axtree_object', {}),  # accessibility tree object
            extra_element_properties=obs.get('extra_element_properties', {}),
            focused_element_bid=obs.get(
                'focused_element_bid', None
            ),  # focused element bid
            last_browser_action=obs.get(
                'last_action', ''
            ),  # last browser env action performed
            last_browser_action_error=last_action_error,
            error=bool(last_action_error),  # error flag
            trigger_by_action=action.action,
        )
    except Exception as e:
        # Any failure is reported to the caller as an error observation
        # rather than raised.
        return BrowserOutputObservation(
            content=str(e),
            screenshot='',
            screenshot_path=None,
            error=True,
            last_browser_action_error=str(e),
            # asked_url is only bound on the BrowseURLAction path, so it is
            # only read when the action is a plain browse.
            url=asked_url if action.action == ActionType.BROWSE else '',
            trigger_by_action=action.action,
        )
|