diff --git a/python/packages/autogen-ext/pyproject.toml b/python/packages/autogen-ext/pyproject.toml
index 11098dbec..a8ba6c3c5 100644
--- a/python/packages/autogen-ext/pyproject.toml
+++ b/python/packages/autogen-ext/pyproject.toml
@@ -32,6 +32,7 @@ web-surfer = [
"autogen-agentchat==0.4.0.dev11",
"playwright>=1.48.0",
"pillow>=11.0.0",
+ "markitdown>=0.0.1a2",
]
magentic-one = [
"autogen-agentchat==0.4.0.dev11",
@@ -77,7 +78,11 @@ testpaths = ["tests"]
include = "../../shared_tasks.toml"
[tool.poe.tasks]
-test = "pytest -n auto"
+test.sequence = [
+ "playwright install",
+ "pytest -n auto",
+]
+test.default_item_type = "cmd"
mypy = "mypy --config-file ../../pyproject.toml --exclude src/autogen_ext/runtimes/grpc/protos --exclude tests/protos src tests"
[tool.mypy]
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/__init__.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/__init__.py
index 3030f7f8b..5b3efc93d 100644
--- a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/__init__.py
+++ b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/__init__.py
@@ -1,3 +1,4 @@
from ._multimodal_web_surfer import MultimodalWebSurfer
+from .playwright_controller import PlaywrightController
-__all__ = ["MultimodalWebSurfer"]
+__all__ = ["MultimodalWebSurfer", "PlaywrightController"]
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py
index 94301ba7e..26907ca0b 100644
--- a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py
+++ b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py
@@ -9,29 +9,28 @@ import time
import traceback
from typing import (
Any,
+ AsyncGenerator,
BinaryIO,
Dict,
List,
Optional,
Sequence,
- Tuple,
cast,
)
-
-# Any, Callable, Dict, List, Literal, Tuple
-from urllib.parse import quote_plus # parse_qs, quote, unquote, urlparse, urlunparse
+from urllib.parse import quote_plus
import aiofiles
import PIL.Image
from autogen_agentchat.agents import BaseChatAgent
from autogen_agentchat.base import Response
-from autogen_agentchat.messages import ChatMessage, MultiModalMessage, TextMessage
+from autogen_agentchat.messages import AgentMessage, ChatMessage, MultiModalMessage, TextMessage
from autogen_core import EVENT_LOGGER_NAME, CancellationToken, FunctionCall
from autogen_core import Image as AGImage
from autogen_core.models import (
AssistantMessage,
ChatCompletionClient,
LLMMessage,
+ RequestUsage,
SystemMessage,
UserMessage,
)
@@ -39,7 +38,6 @@ from PIL import Image
from playwright.async_api import BrowserContext, Download, Page, Playwright, async_playwright
from ._events import WebSurferEvent
-from ._playwright_controller import PlaywrightController
from ._prompts import WEB_SURFER_OCR_PROMPT, WEB_SURFER_QA_PROMPT, WEB_SURFER_QA_SYSTEM_MESSAGE, WEB_SURFER_TOOL_PROMPT
from ._set_of_mark import add_set_of_mark
from ._tool_definitions import (
@@ -49,8 +47,6 @@ from ._tool_definitions import (
TOOL_PAGE_DOWN,
TOOL_PAGE_UP,
TOOL_READ_PAGE_AND_ANSWER,
- # TOOL_SCROLL_ELEMENT_DOWN,
- # TOOL_SCROLL_ELEMENT_UP,
TOOL_SLEEP,
TOOL_SUMMARIZE_PAGE,
TOOL_TYPE,
@@ -59,67 +55,139 @@ from ._tool_definitions import (
)
from ._types import InteractiveRegion, UserContent
from ._utils import message_content_to_str
-
-# Viewport dimensions
-VIEWPORT_HEIGHT = 900
-VIEWPORT_WIDTH = 1440
-
-# Size of the image we send to the MLM
-# Current values represent a 0.85 scaling to fit within the GPT-4v short-edge constraints (768px)
-MLM_HEIGHT = 765
-MLM_WIDTH = 1224
-
-SCREENSHOT_TOKENS = 1105
+from .playwright_controller import PlaywrightController
class MultimodalWebSurfer(BaseChatAgent):
- """(In preview) A multimodal agent that acts as a web surfer that can search the web and visit web pages."""
+ """
+ MultimodalWebSurfer is a multimodal agent that acts as a web surfer that can search the web and visit web pages.
- DEFAULT_DESCRIPTION = "A helpful assistant with access to a web browser. Ask them to perform web searches, open pages, and interact with content (e.g., clicking links, scrolling the viewport, etc., filling in form fields, etc.) It can also summarize the entire page, or answer questions based on the content of the page. It can also be asked to sleep and wait for pages to load, in cases where the pages seem to be taking a while to load."
+ It launches a Chromium browser and uses Playwright to interact with it, which lets it perform a variety of actions. The browser is launched on the first call to the agent and is reused for subsequent calls.
+ It must be used with a multimodal model client that supports function/tool calling, ideally GPT-4o currently.
+
+
+ When :meth:`on_messages` or :meth:`on_messages_stream` is called, the following occurs:
+ 1) If this is the first call, the browser is initialized and the page is loaded. This is done in :meth:`_lazy_init`.
+ 2) The method :meth:`_generate_reply` is called, which then creates the final response as below.
+ 3) The agent takes a screenshot of the page, extracts the interactive elements, and prepares a set-of-mark screenshot with bounding boxes around the interactive elements.
+ 4) The agent makes a call to the :attr:`model_client` with the SOM screenshot, history of messages, and the list of available tools.
+ - If the model returns a string, the agent returns the string as the final response.
+ - If the model returns a list of tool calls, the agent executes the tool calls with :meth:`_execute_tool` using :attr:`_playwright_controller`.
+ - The agent returns a final response which includes a screenshot of the page, page metadata, description of the action taken and the inner text of the webpage.
+ 5) If at any point the agent encounters an error, it returns the error message as the final response.
+
+
+ .. note::
+ Please note that using the MultimodalWebSurfer involves interacting with a digital world designed for humans, which carries inherent risks.
+ Be aware that agents may occasionally attempt risky actions, such as recruiting humans for help or accepting cookie agreements without human involvement. Always ensure agents are monitored and operate within a controlled environment to prevent unintended consequences.
+ Moreover, be cautious that MultimodalWebSurfer may be susceptible to prompt injection attacks from webpages.
+
+ Args:
+ name (str): The name of the agent.
+ model_client (ChatCompletionClient): The model client used by the agent. Must be multimodal and support function calling.
+ downloads_folder (str, optional): The folder where downloads are saved. Defaults to None, in which case no downloads are saved.
+ description (str, optional): The description of the agent. Defaults to MultimodalWebSurfer.DEFAULT_DESCRIPTION.
+ debug_dir (str, optional): The directory where debug information is saved. Defaults to None.
+ headless (bool, optional): Whether the browser should be headless. Defaults to True.
+ start_page (str, optional): The start page for the browser. Defaults to MultimodalWebSurfer.DEFAULT_START_PAGE.
+ animate_actions (bool, optional): Whether to animate actions. Defaults to False.
+ to_save_screenshots (bool, optional): Whether to save screenshots to the debug directory. Requires debug_dir to be set. Defaults to False.
+ use_ocr (bool, optional): Whether to use OCR to extract text from the page screenshot; otherwise text is extracted from the page itself. Defaults to True.
+ browser_channel (str, optional): The browser channel. Defaults to None.
+ browser_data_dir (str, optional): The browser data directory. Defaults to None.
+ to_resize_viewport (bool, optional): Whether to resize the viewport. Defaults to True.
+ playwright (Playwright, optional): The playwright instance. Defaults to None.
+ context (BrowserContext, optional): The browser context. Defaults to None.
+
+
+
+
+ Example usage:
+
+ The following example demonstrates how to create a web surfing agent with
+ a model client and run it for multiple turns.
+
+ .. code-block:: python
+
+
+ import asyncio
+ from autogen_agentchat.ui import Console
+ from autogen_agentchat.teams import RoundRobinGroupChat
+ from autogen_ext.models.openai import OpenAIChatCompletionClient
+ from autogen_ext.agents.web_surfer import MultimodalWebSurfer
+
+
+ async def main() -> None:
+ # Define an agent
+ web_surfer_agent = MultimodalWebSurfer(
+ name="MultimodalWebSurfer",
+ model_client=OpenAIChatCompletionClient(model="gpt-4o-2024-08-06"),
+ )
+
+ # Define a team
+ agent_team = RoundRobinGroupChat([web_surfer_agent], max_turns=3)
+
+ # Run the team and stream messages to the console
+ stream = agent_team.run_stream(task="Navigate to the AutoGen readme on GitHub.")
+ await Console(stream)
+
+
+ asyncio.run(main())
+ """
+
+ DEFAULT_DESCRIPTION = """
+ A helpful assistant with access to a web browser.
+ Ask them to perform web searches, open pages, and interact with content (e.g., clicking links, scrolling the viewport, etc., filling in form fields, etc.).
+ It can also summarize the entire page, or answer questions based on the content of the page.
+ It can also be asked to sleep and wait for pages to load, in cases where the pages seem to be taking a while to load.
+ """
DEFAULT_START_PAGE = "https://www.bing.com/"
+ # Viewport dimensions
+ VIEWPORT_HEIGHT = 900
+ VIEWPORT_WIDTH = 1440
+
+ # Size of the image we send to the MLM
+ # Current values represent a 0.85 scaling to fit within the GPT-4v short-edge constraints (768px)
+ MLM_HEIGHT = 765
+ MLM_WIDTH = 1224
+
+ SCREENSHOT_TOKENS = 1105
+
def __init__(
self,
name: str,
model_client: ChatCompletionClient,
+ downloads_folder: str | None = None,
description: str = DEFAULT_DESCRIPTION,
+ debug_dir: str | None = None,
headless: bool = True,
+ start_page: str | None = DEFAULT_START_PAGE,
+ animate_actions: bool = False,
+ to_save_screenshots: bool = False,
+ use_ocr: bool = True,
browser_channel: str | None = None,
browser_data_dir: str | None = None,
- start_page: str | None = None,
- downloads_folder: str | None = None,
- debug_dir: str | None = os.getcwd(),
- to_save_screenshots: bool = False,
- animate_actions: bool = False,
- use_ocr: bool = True,
to_resize_viewport: bool = True,
playwright: Playwright | None = None,
context: BrowserContext | None = None,
):
"""
Initialize the MultimodalWebSurfer.
-
- Args:
- name (str): The agent's name
- model_client (ChatCompletionClient): The model to use (must be multi-modal)
- description (str): The agent's description used by the team. Defaults to DEFAULT_DESCRIPTION
- headless (bool): Whether to run the browser in headless mode. Defaults to True.
- browser_channel (str | type[DEFAULT_CHANNEL]): The browser channel to use. Defaults to DEFAULT_CHANNEL.
- browser_data_dir (str | None): The directory to store browser data. Defaults to None.
- start_page (str | None): The initial page to visit. Defaults to DEFAULT_START_PAGE.
- downloads_folder (str | None): The folder to save downloads. Defaults to None.
- debug_dir (str | None): The directory to save debug information. Defaults to the current working directory.
- to_save_screenshots (bool): Whether to save screenshots. Defaults to False.
- animate_actions (bool): Whether to animate actions. Defaults to False.
- use_ocr (bool): Whether to use OCR to extract text from screenshots, otherwise extract text from page. Defaults to True.
- to_resize_viewport (bool): Whether to resize the viewport. Defaults to True.
- playwright (Playwright | None): The playwright instance to use. Defaults to None and creates a new one.
- context (BrowserContext | None): The browser context to use. Defaults to None and creates a new one.
"""
super().__init__(name, description)
+ if debug_dir is None and to_save_screenshots:
+ raise ValueError(
+ "Cannot save screenshots without a debug directory. Set it using the 'debug_dir' parameter. The debug directory is created if it does not exist."
+ )
+ if model_client.capabilities["function_calling"] is False:
+ raise ValueError(
+ "The model does not support function calling. MultimodalWebSurfer requires a model that supports function calling."
+ )
+ if model_client.capabilities["vision"] is False:
+ raise ValueError("The model is not multimodal. MultimodalWebSurfer requires a multimodal model.")
self._model_client = model_client
-
self.headless = headless
self.browser_channel = browser_channel
self.browser_data_dir = browser_data_dir
@@ -150,8 +218,8 @@ class MultimodalWebSurfer(BaseChatAgent):
self._playwright_controller = PlaywrightController(
animate_actions=self.animate_actions,
downloads_folder=self.downloads_folder,
- viewport_width=VIEWPORT_WIDTH,
- viewport_height=VIEWPORT_HEIGHT,
+ viewport_width=self.VIEWPORT_WIDTH,
+ viewport_height=self.VIEWPORT_HEIGHT,
_download_handler=self._download_handler,
to_resize_viewport=self.to_resize_viewport,
)
@@ -166,32 +234,79 @@ class MultimodalWebSurfer(BaseChatAgent):
TOOL_SLEEP,
TOOL_HOVER,
]
- self.did_lazy_init = False
+ self.n_lines_page_text = 50 # Number of lines of text to extract from the page in the absence of OCR
+ self.did_lazy_init = False # flag to check if we have initialized the browser
+
+ async def _lazy_init(
+ self,
+ ) -> None:
+ """
+ On the first call, we initialize the browser and the page.
+ """
+ self._last_download = None
+ self._prior_metadata_hash = None
+
+ # Create the Playwright instance
+ launch_args: Dict[str, Any] = {"headless": self.headless}
+ if self.browser_channel is not None:
+ launch_args["channel"] = self.browser_channel
+ if self._playwright is None:
+ self._playwright = await async_playwright().start()
+
+ # Create the context -- are we launching persistent?
+ if self._context is None:
+ if self.browser_data_dir is None:
+ browser = await self._playwright.chromium.launch(**launch_args)
+ self._context = await browser.new_context(
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
+ )
+ else:
+ self._context = await self._playwright.chromium.launch_persistent_context(
+ self.browser_data_dir, **launch_args
+ )
+
+ # Create the page
+ self._context.set_default_timeout(60000) # One minute
+ self._page = await self._context.new_page()
+ assert self._page is not None
+ # self._page.route(lambda x: True, self._route_handler)
+ self._page.on("download", self._download_handler)
+ if self.to_resize_viewport:
+ await self._page.set_viewport_size({"width": self.VIEWPORT_WIDTH, "height": self.VIEWPORT_HEIGHT})
+ await self._page.add_init_script(
+ path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js")
+ )
+ await self._page.goto(self.start_page)
+ await self._page.wait_for_load_state()
+
+ # Prepare the debug directory -- which stores the screenshots generated throughout the process
+ await self._set_debug_dir(self.debug_dir)
+ self.did_lazy_init = True
+
+ async def _set_debug_dir(self, debug_dir: str | None) -> None:
+ assert self._page is not None
+ if self.debug_dir is None:
+ return
+
+ if not os.path.isdir(self.debug_dir):
+ os.mkdir(self.debug_dir)
+
+ if self.to_save_screenshots:
+ current_timestamp = "_" + int(time.time()).__str__()
+ screenshot_png_name = "screenshot" + current_timestamp + ".png"
+ await self._page.screenshot(path=os.path.join(self.debug_dir, screenshot_png_name))
+ self.logger.info(
+ WebSurferEvent(
+ source=self.name,
+ url=self._page.url,
+ message="Screenshot: " + screenshot_png_name,
+ )
+ )
@property
def produced_message_types(self) -> List[type[ChatMessage]]:
return [MultiModalMessage]
- async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
- for chat_message in messages:
- if isinstance(chat_message, TextMessage | MultiModalMessage):
- self._chat_history.append(UserMessage(content=chat_message.content, source=chat_message.source))
- else:
- raise ValueError(f"Unexpected message in MultiModalWebSurfer: {chat_message}")
-
- try:
- _, content = await self.__generate_reply(cancellation_token=cancellation_token)
- self._chat_history.append(AssistantMessage(content=message_content_to_str(content), source=self.name))
- if isinstance(content, str):
- return Response(chat_message=TextMessage(content=content, source=self.name))
- else:
- return Response(chat_message=MultiModalMessage(content=content, source=self.name))
-
- except BaseException:
- content = f"Web surfing error:\n\n{traceback.format_exc()}"
- self._chat_history.append(AssistantMessage(content=content, source=self.name))
- return Response(chat_message=TextMessage(content=content, source=self.name))
-
async def on_reset(self, cancellation_token: CancellationToken) -> None:
if not self.did_lazy_init:
return
@@ -225,62 +340,81 @@ class MultimodalWebSurfer(BaseChatAgent):
)
)
- async def _lazy_init(
- self,
- ) -> None:
- self._last_download = None
- self._prior_metadata_hash = None
+ async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
+ async for message in self.on_messages_stream(messages, cancellation_token):
+ if isinstance(message, Response):
+ return message
+ raise AssertionError("The stream should have returned the final result.")
- # Create the playwright self
- launch_args: Dict[str, Any] = {"headless": self.headless}
- if self.browser_channel is not None:
- launch_args["channel"] = self.browser_channel
- if self._playwright is None:
- self._playwright = await async_playwright().start()
-
- # Create the context -- are we launching persistent?
- if self._context is None:
- if self.browser_data_dir is None:
- browser = await self._playwright.chromium.launch(**launch_args)
- self._context = await browser.new_context(
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
+ async def on_messages_stream(
+ self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken
+ ) -> AsyncGenerator[AgentMessage | Response, None]:
+ for chat_message in messages:
+ if isinstance(chat_message, TextMessage | MultiModalMessage):
+ self._chat_history.append(UserMessage(content=chat_message.content, source=chat_message.source))
+ else:
+ raise ValueError(f"Unexpected message in MultiModalWebSurfer: {chat_message}")
+ self.inner_messages: List[AgentMessage] = []
+ self.model_usage: List[RequestUsage] = []
+ try:
+ content = await self._generate_reply(cancellation_token=cancellation_token)
+ self._chat_history.append(AssistantMessage(content=message_content_to_str(content), source=self.name))
+ final_usage = RequestUsage(
+ prompt_tokens=sum([u.prompt_tokens for u in self.model_usage]),
+ completion_tokens=sum([u.completion_tokens for u in self.model_usage]),
+ )
+ if isinstance(content, str):
+ yield Response(
+ chat_message=TextMessage(content=content, source=self.name, models_usage=final_usage),
+ inner_messages=self.inner_messages,
)
else:
- self._context = await self._playwright.chromium.launch_persistent_context(
- self.browser_data_dir, **launch_args
+ yield Response(
+ chat_message=MultiModalMessage(content=content, source=self.name, models_usage=final_usage),
+ inner_messages=self.inner_messages,
)
- # Create the page
- self._context.set_default_timeout(60000) # One minute
- self._page = await self._context.new_page()
+ except BaseException:
+ content = f"Web surfing error:\n\n{traceback.format_exc()}"
+ self._chat_history.append(AssistantMessage(content=content, source=self.name))
+ yield Response(chat_message=TextMessage(content=content, source=self.name))
+
+ async def _generate_reply(self, cancellation_token: CancellationToken) -> UserContent:
+ """Generates the actual reply. First calls the LLM to figure out which tool to use, then executes the tool."""
+
+ # Lazy init: initialize the browser and the page on the first call only
+ if not self.did_lazy_init:
+ await self._lazy_init()
+
assert self._page is not None
- # self._page.route(lambda x: True, self._route_handler)
- self._page.on("download", self._download_handler)
- if self.to_resize_viewport:
- await self._page.set_viewport_size({"width": VIEWPORT_WIDTH, "height": VIEWPORT_HEIGHT})
- await self._page.add_init_script(
- path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js")
- )
- await self._page.goto(self.start_page)
- await self._page.wait_for_load_state()
- # Prepare the debug directory -- which stores the screenshots generated throughout the process
- await self._set_debug_dir(self.debug_dir)
- self.did_lazy_init = True
+ # Clone the messages to give context, removing old screenshots
+ history: List[LLMMessage] = []
+ for m in self._chat_history:
+ assert isinstance(m, UserMessage | AssistantMessage | SystemMessage)
+ assert isinstance(m.content, str | list)
- async def _set_debug_dir(self, debug_dir: str | None) -> None:
- assert self._page is not None
- self.debug_dir = debug_dir
- if self.debug_dir is None:
- return
+ if isinstance(m.content, str):
+ history.append(m)
+ else:
+ content = message_content_to_str(m.content)
+ if isinstance(m, UserMessage):
+ history.append(UserMessage(content=content, source=m.source))
+ elif isinstance(m, AssistantMessage):
+ history.append(AssistantMessage(content=content, source=m.source))
+ elif isinstance(m, SystemMessage):
+ history.append(SystemMessage(content=content))
- if not os.path.isdir(self.debug_dir):
- os.mkdir(self.debug_dir)
+ # Ask the page for interactive elements, then prepare the set-of-mark screenshot
+ rects = await self._playwright_controller.get_interactive_rects(self._page)
+ viewport = await self._playwright_controller.get_visual_viewport(self._page)
+ screenshot = await self._page.screenshot()
+ som_screenshot, visible_rects, rects_above, rects_below = add_set_of_mark(screenshot, rects)
if self.to_save_screenshots:
current_timestamp = "_" + int(time.time()).__str__()
- screenshot_png_name = "screenshot" + current_timestamp + ".png"
- await self._page.screenshot(path=os.path.join(self.debug_dir, screenshot_png_name))
+ screenshot_png_name = "screenshot_som" + current_timestamp + ".png"
+ som_screenshot.save(os.path.join(self.debug_dir, screenshot_png_name)) # type: ignore
self.logger.info(
WebSurferEvent(
source=self.name,
@@ -288,34 +422,83 @@ class MultimodalWebSurfer(BaseChatAgent):
message="Screenshot: " + screenshot_png_name,
)
)
+ # What tools are available?
+ tools = self.default_tools.copy()
- def _target_name(self, target: str, rects: Dict[str, InteractiveRegion]) -> str | None:
- try:
- return rects[target]["aria_name"].strip()
- except KeyError:
- return None
+ # We can scroll up
+ if viewport["pageTop"] > 5:
+ tools.append(TOOL_PAGE_UP)
- def _format_target_list(self, ids: List[str], rects: Dict[str, InteractiveRegion]) -> List[str]:
- targets: List[str] = []
- for r in list(set(ids)):
- if r in rects:
- # Get the role
- aria_role = rects[r].get("role", "").strip()
- if len(aria_role) == 0:
- aria_role = rects[r].get("tag_name", "").strip()
+ # Can scroll down
+ if (viewport["pageTop"] + viewport["height"] + 5) < viewport["scrollHeight"]:
+ tools.append(TOOL_PAGE_DOWN)
- # Get the name
- aria_name = re.sub(r"[\n\r]+", " ", rects[r].get("aria_name", "")).strip()
+ # Focus hint
+ focused = await self._playwright_controller.get_focused_rect_id(self._page)
+ focused_hint = ""
+ if focused:
+ name = self._target_name(focused, rects)
+ if name:
+ name = f"(and name '{name}') "
- # What are the actions?
- actions = ['"click", "hover"']
- if rects[r]["role"] in ["textbox", "searchbox", "search"]:
- actions = ['"input_text"']
- actions_str = "[" + ",".join(actions) + "]"
+ role = "control"
+ try:
+ role = rects[focused]["role"]
+ except KeyError:
+ pass
- targets.append(f'{{"id": {r}, "name": "{aria_name}", "role": "{aria_role}", "tools": {actions_str} }}')
+ focused_hint = f"\nThe {role} with ID {focused} {name}currently has the input focus.\n\n"
- return targets
+ # Everything visible
+ visible_targets = "\n".join(self._format_target_list(visible_rects, rects)) + "\n\n"
+
+ # Everything else
+ other_targets: List[str] = []
+ other_targets.extend(self._format_target_list(rects_above, rects))
+ other_targets.extend(self._format_target_list(rects_below, rects))
+
+ if len(other_targets) > 0:
+ other_targets_str = (
+ "Additional valid interaction targets (not shown) include:\n" + "\n".join(other_targets) + "\n\n"
+ )
+ else:
+ other_targets_str = ""
+
+ tool_names = "\n".join([t["name"] for t in tools])
+
+ text_prompt = WEB_SURFER_TOOL_PROMPT.format(
+ url=self._page.url,
+ visible_targets=visible_targets,
+ other_targets_str=other_targets_str,
+ focused_hint=focused_hint,
+ tool_names=tool_names,
+ ).strip()
+
+ # Scale the screenshot for the MLM, and close the original
+ scaled_screenshot = som_screenshot.resize((self.MLM_WIDTH, self.MLM_HEIGHT))
+ som_screenshot.close()
+ if self.to_save_screenshots:
+ scaled_screenshot.save(os.path.join(self.debug_dir, "screenshot_scaled.png")) # type: ignore
+
+ # Add the multimodal message and make the request
+ history.append(UserMessage(content=[text_prompt, AGImage.from_pil(scaled_screenshot)], source=self.name))
+
+ response = await self._model_client.create(
+ history, tools=tools, extra_create_args={"tool_choice": "auto"}, cancellation_token=cancellation_token
+ ) # , "parallel_tool_calls": False})
+ self.model_usage.append(response.usage)
+ message = response.content
+ self._last_download = None
+ if isinstance(message, str):
+ # Answer directly
+ self.inner_messages.append(TextMessage(content=message, source=self.name))
+ return message
+ elif isinstance(message, list):
+ # Take an action
+ return await self._execute_tool(message, rects, tool_names, cancellation_token=cancellation_token)
+ else:
+ # Not sure what happened here
+ raise AssertionError(f"Unknown response format '{message}'")
async def _execute_tool(
self,
@@ -323,7 +506,8 @@ class MultimodalWebSurfer(BaseChatAgent):
rects: Dict[str, InteractiveRegion],
tool_names: str,
cancellation_token: Optional[CancellationToken] = None,
- ) -> Tuple[bool, UserContent]:
+ ) -> UserContent:
+ # Execute the tool
name = message[0].name
args = json.loads(message[0].arguments)
action_description = ""
@@ -337,6 +521,7 @@ class MultimodalWebSurfer(BaseChatAgent):
message=f"{name}( {json.dumps(args)} )",
)
)
+ self.inner_messages.append(TextMessage(content=f"{name}( {json.dumps(args)} )", source=self.name))
if name == "visit_url":
url = args.get("url")
@@ -435,11 +620,11 @@ class MultimodalWebSurfer(BaseChatAgent):
question = str(args.get("question"))
action_description = f"I answered the following question '{question}' based on the web page."
# Do Q&A on the DOM. No need to take further action. Browser state does not change.
- return False, await self._summarize_page(question=question, cancellation_token=cancellation_token)
+ return await self._summarize_page(question=question, cancellation_token=cancellation_token)
elif name == "summarize_page":
# Summarize the DOM. No need to take further action. Browser state does not change.
action_description = "I summarized the current web page"
- return False, await self._summarize_page(cancellation_token=cancellation_token)
+ return await self._summarize_page(cancellation_token=cancellation_token)
elif name == "hover":
target_id = str(args.get("target_id"))
@@ -452,18 +637,17 @@ class MultimodalWebSurfer(BaseChatAgent):
elif name == "sleep":
action_description = "I am waiting a short period of time before taking further action."
- await self._playwright_controller.sleep(self._page, 3) # There's a 2s sleep below too
+ await self._playwright_controller.sleep(self._page, 3)
else:
raise ValueError(f"Unknown tool '{name}'. Please choose from:\n\n{tool_names}")
await self._page.wait_for_load_state()
- await self._playwright_controller.sleep(self._page, 3) # There's a 2s sleep below too
+ await self._playwright_controller.sleep(self._page, 3)
# Handle downloads
if self._last_download is not None and self.downloads_folder is not None:
fname = os.path.join(self.downloads_folder, self._last_download.suggested_filename)
- # TODO: Fix this type
await self._last_download.save_as(fname) # type: ignore
page_body = f"
Download Successful
Successfully downloaded '{self._last_download.suggested_filename}' to local path:
{fname}
"
await self._page.goto(
@@ -510,158 +694,66 @@ class MultimodalWebSurfer(BaseChatAgent):
ocr_text = (
await self._get_ocr_text(new_screenshot, cancellation_token=cancellation_token)
if self.use_ocr is True
- else await self._playwright_controller.get_webpage_text(self._page)
+ else await self._playwright_controller.get_webpage_text(self._page, n_lines=self.n_lines_page_text)
)
# Return the complete observation
- message_content = "" # message.content or ""
page_title = await self._page.title()
+ message_content = f"{action_description}\n\n Here is a screenshot of the webpage: [{page_title}]({self._page.url}).\n The viewport shows {percent_visible}% of the webpage, and is positioned {position_text} {page_metadata}\n"
+ if self.use_ocr:
+ message_content += f"Automatic OCR of the page screenshot has detected the following text:\n\n{ocr_text}"
+ else:
+ message_content += f"The first {self.n_lines_page_text} lines of the page text is:\n\n{ocr_text}"
- return False, [
- f"{message_content}\n\n{action_description}\n\nHere is a screenshot of [{page_title}]({self._page.url}). The viewport shows {percent_visible}% of the webpage, and is positioned {position_text}.{page_metadata}\nAutomatic OCR of the page screenshot has detected the following text:\n\n{ocr_text}".strip(),
+ return [
+ message_content,
AGImage.from_pil(PIL.Image.open(io.BytesIO(new_screenshot))),
]
- async def __generate_reply(self, cancellation_token: CancellationToken) -> Tuple[bool, UserContent]:
- """Generates the actual reply. First calls the LLM to figure out which tool to use, then executes the tool."""
+ def _target_name(self, target: str, rects: Dict[str, InteractiveRegion]) -> str | None:
+ try:
+ return rects[target]["aria_name"].strip()
+ except KeyError:
+ return None
- # Lazy init
- if not self.did_lazy_init:
- await self._lazy_init()
+ def _format_target_list(self, ids: List[str], rects: Dict[str, InteractiveRegion]) -> List[str]:
+ """
+ Format each unique target id present in `rects` as one JSON-like line for the agent's prompt, in first-seen order.
+ """
+ targets: List[str] = []
+ for r in dict.fromkeys(ids):
+ if r in rects:
+ # Get the role
+ aria_role = rects[r].get("role", "").strip()
+ if len(aria_role) == 0:
+ aria_role = rects[r].get("tag_name", "").strip()
- assert self._page is not None
+ # Get the name
+ aria_name = re.sub(r"[\n\r]+", " ", rects[r].get("aria_name", "")).strip()
- # Clone the messages to give context, removing old screenshots
- history: List[LLMMessage] = []
- for m in self._chat_history:
- assert isinstance(m, UserMessage | AssistantMessage | SystemMessage)
- assert isinstance(m.content, str | list)
+ # What are the actions? Text-entry roles get "input_text"; everything else gets "click"/"hover".
+ actions = ['"click", "hover"']
+ if rects[r].get("role", "") in ["textbox", "searchbox", "search"]:
+ actions = ['"input_text"']
+ actions_str = "[" + ",".join(actions) + "]"
- if isinstance(m.content, str):
- history.append(m)
- else:
- content = message_content_to_str(m.content)
- if isinstance(m, UserMessage):
- history.append(UserMessage(content=content, source=m.source))
- elif isinstance(m, AssistantMessage):
- history.append(AssistantMessage(content=content, source=m.source))
- elif isinstance(m, SystemMessage):
- history.append(SystemMessage(content=content))
+ targets.append(f'{{"id": {r}, "name": "{aria_name}", "role": "{aria_role}", "tools": {actions_str} }}')
- # Ask the page for interactive elements, then prepare the state-of-mark screenshot
- rects = await self._playwright_controller.get_interactive_rects(self._page)
- viewport = await self._playwright_controller.get_visual_viewport(self._page)
- screenshot = await self._page.screenshot()
- som_screenshot, visible_rects, rects_above, rects_below = add_set_of_mark(screenshot, rects)
-
- if self.to_save_screenshots:
- current_timestamp = "_" + int(time.time()).__str__()
- screenshot_png_name = "screenshot_som" + current_timestamp + ".png"
- som_screenshot.save(os.path.join(self.debug_dir, screenshot_png_name)) # type: ignore
- self.logger.info(
- WebSurferEvent(
- source=self.name,
- url=self._page.url,
- message="Screenshot: " + screenshot_png_name,
- )
- )
- # What tools are available?
- tools = self.default_tools.copy()
-
- # We can scroll up
- if viewport["pageTop"] > 5:
- tools.append(TOOL_PAGE_UP)
-
- # Can scroll down
- if (viewport["pageTop"] + viewport["height"] + 5) < viewport["scrollHeight"]:
- tools.append(TOOL_PAGE_DOWN)
-
- # Focus hint
- focused = await self._playwright_controller.get_focused_rect_id(self._page)
- focused_hint = ""
- if focused:
- name = self._target_name(focused, rects)
- if name:
- name = f"(and name '{name}') "
-
- role = "control"
- try:
- role = rects[focused]["role"]
- except KeyError:
- pass
-
- focused_hint = f"\nThe {role} with ID {focused} {name}currently has the input focus.\n\n"
-
- # Everything visible
- visible_targets = "\n".join(self._format_target_list(visible_rects, rects)) + "\n\n"
-
- # Everything else
- other_targets: List[str] = []
- other_targets.extend(self._format_target_list(rects_above, rects))
- other_targets.extend(self._format_target_list(rects_below, rects))
-
- if len(other_targets) > 0:
- other_targets_str = (
- "Additional valid interaction targets (not shown) include:\n" + "\n".join(other_targets) + "\n\n"
- )
- else:
- other_targets_str = ""
-
- # If there are scrollable elements, then add the corresponding tools
- # has_scrollable_elements = False
- # if has_scrollable_elements:
- # tools.append(TOOL_SCROLL_ELEMENT_UP)
- # tools.append(TOOL_SCROLL_ELEMENT_DOWN)
-
- tool_names = "\n".join([t["name"] for t in tools])
-
- text_prompt = WEB_SURFER_TOOL_PROMPT.format(
- url=self._page.url,
- visible_targets=visible_targets,
- other_targets_str=other_targets_str,
- focused_hint=focused_hint,
- tool_names=tool_names,
- ).strip()
-
- # Scale the screenshot for the MLM, and close the original
- scaled_screenshot = som_screenshot.resize((MLM_WIDTH, MLM_HEIGHT))
- som_screenshot.close()
- if self.to_save_screenshots:
- scaled_screenshot.save(os.path.join(self.debug_dir, "screenshot_scaled.png")) # type: ignore
-
- # Add the multimodal message and make the request
- history.append(UserMessage(content=[text_prompt, AGImage.from_pil(scaled_screenshot)], source=self.name))
-
- response = await self._model_client.create(
- history, tools=tools, extra_create_args={"tool_choice": "auto"}, cancellation_token=cancellation_token
- ) # , "parallel_tool_calls": False})
- message = response.content
- self._last_download = None
-
- if isinstance(message, str):
- # Answer directly
- return False, message
- elif isinstance(message, list):
- # Take an action
- return await self._execute_tool(message, rects, tool_names, cancellation_token=cancellation_token)
- else:
- # Not sure what happened here
- raise AssertionError(f"Unknown response format '{message}'")
+ return targets
async def _get_ocr_text(
self, image: bytes | io.BufferedIOBase | PIL.Image.Image, cancellation_token: Optional[CancellationToken] = None
) -> str:
scaled_screenshot = None
if isinstance(image, PIL.Image.Image):
- scaled_screenshot = image.resize((MLM_WIDTH, MLM_HEIGHT))
+ scaled_screenshot = image.resize((self.MLM_WIDTH, self.MLM_HEIGHT))
else:
pil_image = None
if not isinstance(image, io.BufferedIOBase):
pil_image = PIL.Image.open(io.BytesIO(image))
else:
- # TODO: Not sure why this cast was needed, but by this point screenshot is a binary file-like object
pil_image = PIL.Image.open(cast(BinaryIO, image))
- scaled_screenshot = pil_image.resize((MLM_WIDTH, MLM_HEIGHT))
+ scaled_screenshot = pil_image.resize((self.MLM_WIDTH, self.MLM_HEIGHT))
pil_image.close()
# Add the multimodal message and make the request
@@ -676,6 +768,7 @@ class MultimodalWebSurfer(BaseChatAgent):
)
)
response = await self._model_client.create(messages, cancellation_token=cancellation_token)
+ self.model_usage.append(response.usage)
scaled_screenshot.close()
assert isinstance(response.content, str)
return response.content
@@ -697,7 +790,7 @@ class MultimodalWebSurfer(BaseChatAgent):
# Take a screenshot and scale it
screenshot = Image.open(io.BytesIO(await self._page.screenshot()))
- scaled_screenshot = screenshot.resize((MLM_WIDTH, MLM_HEIGHT))
+ scaled_screenshot = screenshot.resize((self.MLM_WIDTH, self.MLM_HEIGHT))
screenshot.close()
ag_image = AGImage.from_pil(scaled_screenshot)
@@ -718,7 +811,7 @@ class MultimodalWebSurfer(BaseChatAgent):
)
remaining = self._model_client.remaining_tokens(messages + [message])
- if remaining > SCREENSHOT_TOKENS:
+ if remaining > self.SCREENSHOT_TOKENS:
buffer += line
else:
break
@@ -741,6 +834,7 @@ class MultimodalWebSurfer(BaseChatAgent):
# Generate the response
response = await self._model_client.create(messages, cancellation_token=cancellation_token)
+ self.model_usage.append(response.usage)
scaled_screenshot.close()
assert isinstance(response.content, str)
return response.content
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_playwright_controller.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py
similarity index 69%
rename from python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_playwright_controller.py
rename to python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py
index 162980af1..d691826a4 100644
--- a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_playwright_controller.py
+++ b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py
@@ -1,9 +1,15 @@
import asyncio
import base64
+import io
import os
import random
from typing import Any, Callable, Dict, Optional, Tuple, Union, cast
+# TODO: Fix unfollowed import
+try:
+ from markitdown import MarkItDown # type: ignore
+except ImportError:
+ MarkItDown = None
from playwright._impl._errors import Error as PlaywrightError
from playwright._impl._errors import TimeoutError
from playwright.async_api import Download, Page
@@ -17,24 +23,36 @@ from ._types import (
class PlaywrightController:
+ """
+ A helper class to allow Playwright to interact with web pages to perform actions such as clicking, filling, and scrolling.
+
+ Args:
+ downloads_folder (str | None): The folder to save downloads to. If None, downloads are not saved.
+ animate_actions (bool): Whether to animate the actions (create fake cursor to click).
+ viewport_width (int): The width of the viewport.
+ viewport_height (int): The height of the viewport.
+ _download_handler (Optional[Callable[[Download], None]]): A function to handle downloads.
+ to_resize_viewport (bool): Whether to resize the viewport
+ """
+
def __init__(
self,
+ downloads_folder: str | None = None,
animate_actions: bool = False,
- downloads_folder: Optional[str] = None,
viewport_width: int = 1440,
viewport_height: int = 900,
_download_handler: Optional[Callable[[Download], None]] = None,
to_resize_viewport: bool = True,
) -> None:
"""
- A controller for Playwright to interact with web pages.
- animate_actions: If True, actions will be animated.
- downloads_folder: The folder to save downloads to.
- viewport_width: The width of the viewport.
- viewport_height: The height of the viewport.
- _download_handler: A handler for downloads.
- to_resize_viewport: If True, the viewport will be resized.
+ Initialize the PlaywrightController.
"""
+ assert isinstance(animate_actions, bool)
+ assert isinstance(viewport_width, int)
+ assert isinstance(viewport_height, int)
+ assert viewport_height > 0
+ assert viewport_width > 0
+
self.animate_actions = animate_actions
self.downloads_folder = downloads_folder
self.viewport_width = viewport_width
@@ -43,16 +61,33 @@ class PlaywrightController:
self.to_resize_viewport = to_resize_viewport
self._page_script: str = ""
self.last_cursor_position: Tuple[float, float] = (0.0, 0.0)
+ self._markdown_converter: Optional[Any] | None = None
# Read page_script
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
self._page_script = fh.read()
async def sleep(self, page: Page, duration: Union[int, float]) -> None:
+ """
+ Pause the execution for a specified duration.
+
+ Args:
+ page (Page): The Playwright page object.
+ duration (Union[int, float]): The duration to sleep in seconds.
+ """
assert page is not None
await page.wait_for_timeout(duration * 1000)
async def get_interactive_rects(self, page: Page) -> Dict[str, InteractiveRegion]:
+ """
+ Retrieve interactive regions from the web page.
+
+ Args:
+ page (Page): The Playwright page object.
+
+ Returns:
+ Dict[str, InteractiveRegion]: A dictionary of interactive regions.
+ """
assert page is not None
# Read the regions from the DOM
try:
@@ -71,6 +106,15 @@ class PlaywrightController:
return typed_results
async def get_visual_viewport(self, page: Page) -> VisualViewport:
+ """
+ Retrieve the visual viewport of the web page.
+
+ Args:
+ page (Page): The Playwright page object.
+
+ Returns:
+ VisualViewport: The visual viewport of the page.
+ """
assert page is not None
try:
await page.evaluate(self._page_script)
@@ -79,6 +123,15 @@ class PlaywrightController:
return visualviewport_from_dict(await page.evaluate("MultimodalWebSurfer.getVisualViewport();"))
async def get_focused_rect_id(self, page: Page) -> str:
+ """
+ Retrieve the ID of the currently focused element.
+
+ Args:
+ page (Page): The Playwright page object.
+
+ Returns:
+ str: The ID of the focused element.
+ """
assert page is not None
try:
await page.evaluate(self._page_script)
@@ -88,6 +141,15 @@ class PlaywrightController:
return str(result)
async def get_page_metadata(self, page: Page) -> Dict[str, Any]:
+ """
+ Retrieve metadata from the web page.
+
+ Args:
+ page (Page): The Playwright page object.
+
+ Returns:
+ Dict[str, Any]: A dictionary of page metadata.
+ """
assert page is not None
try:
await page.evaluate(self._page_script)
@@ -98,6 +160,12 @@ class PlaywrightController:
return cast(Dict[str, Any], result)
async def on_new_page(self, page: Page) -> None:
+ """
+ Handle actions to perform on a new page.
+
+ Args:
+ page (Page): The Playwright page object.
+ """
assert page is not None
page.on("download", self._download_handler) # type: ignore
if self.to_resize_viewport and self.viewport_width and self.viewport_height:
@@ -107,10 +175,26 @@ class PlaywrightController:
await page.wait_for_load_state()
async def back(self, page: Page) -> None:
+ """
+ Navigate back to the previous page.
+
+ Args:
+ page (Page): The Playwright page object.
+ """
assert page is not None
await page.go_back()
async def visit_page(self, page: Page, url: str) -> Tuple[bool, bool]:
+ """
+ Visit a specified URL.
+
+ Args:
+ page (Page): The Playwright page object.
+ url (str): The URL to visit.
+
+ Returns:
+ Tuple[bool, bool]: A tuple indicating whether to reset prior metadata hash and last download.
+ """
assert page is not None
reset_prior_metadata_hash = False
reset_last_download = False
@@ -143,16 +227,38 @@ class PlaywrightController:
return reset_prior_metadata_hash, reset_last_download
async def page_down(self, page: Page) -> None:
+ """
+ Scroll the page down by one viewport height minus 50 pixels.
+
+ Args:
+ page (Page): The Playwright page object.
+ """
assert page is not None
await page.evaluate(f"window.scrollBy(0, {self.viewport_height-50});")
async def page_up(self, page: Page) -> None:
+ """
+ Scroll the page up by one viewport height minus 50 pixels.
+
+ Args:
+ page (Page): The Playwright page object.
+ """
assert page is not None
await page.evaluate(f"window.scrollBy(0, -{self.viewport_height-50});")
async def gradual_cursor_animation(
self, page: Page, start_x: float, start_y: float, end_x: float, end_y: float
) -> None:
+ """
+ Animate the cursor movement gradually from start to end coordinates.
+
+ Args:
+ page (Page): The Playwright page object.
+ start_x (float): The starting x-coordinate.
+ start_y (float): The starting y-coordinate.
+ end_x (float): The ending x-coordinate.
+ end_y (float): The ending y-coordinate.
+ """
# animation helper
steps = 20
for step in range(steps):
@@ -171,6 +277,13 @@ class PlaywrightController:
self.last_cursor_position = (end_x, end_y)
async def add_cursor_box(self, page: Page, identifier: str) -> None:
+ """
+ Add a red cursor box around the element with the given identifier.
+
+ Args:
+ page (Page): The Playwright page object.
+ identifier (str): The element identifier.
+ """
# animation helper
await page.evaluate(f"""
(function() {{
@@ -199,6 +312,13 @@ class PlaywrightController:
""")
async def remove_cursor_box(self, page: Page, identifier: str) -> None:
+ """
+ Remove the red cursor box around the element with the given identifier.
+
+ Args:
+ page (Page): The Playwright page object.
+ identifier (str): The element identifier.
+ """
# Remove the highlight and cursor
await page.evaluate(f"""
(function() {{
@@ -215,7 +335,14 @@ class PlaywrightController:
async def click_id(self, page: Page, identifier: str) -> Page | None:
"""
- Returns new page if a new page is opened, otherwise None.
+ Click the element with the given identifier.
+
+ Args:
+ page (Page): The Playwright page object.
+ identifier (str): The element identifier.
+
+ Returns:
+ Page | None: The new page if a new page is opened, otherwise None.
"""
new_page: Page | None = None
assert page is not None
@@ -266,7 +393,11 @@ class PlaywrightController:
async def hover_id(self, page: Page, identifier: str) -> None:
"""
- Hovers the mouse over the target with the given id.
+ Hover the mouse over the element with the given identifier.
+
+ Args:
+ page (Page): The Playwright page object.
+ identifier (str): The element identifier.
"""
assert page is not None
target = page.locator(f"[__elementId='{identifier}']")
@@ -296,7 +427,15 @@ class PlaywrightController:
else:
await page.mouse.move(box["x"] + box["width"] / 2, box["y"] + box["height"] / 2)
- async def fill_id(self, page: Page, identifier: str, value: str) -> None:
+ async def fill_id(self, page: Page, identifier: str, value: str, press_enter: bool = True) -> None:
+ """
+ Fill the element with the given identifier with the specified value, pressing Enter afterwards when `press_enter` is True.
+
+ Args:
+ page (Page): The Playwright page object.
+ identifier (str): The element identifier.
+ value (str): The value to fill.
+ """
assert page is not None
target = page.locator(f"[__elementId='{identifier}']")
@@ -332,12 +471,21 @@ class PlaywrightController:
await target.fill(value)
except PlaywrightError:
await target.press_sequentially(value)
- await target.press("Enter")
+ if press_enter:
+ await target.press("Enter")
if self.animate_actions:
await self.remove_cursor_box(page, identifier)
async def scroll_id(self, page: Page, identifier: str, direction: str) -> None:
+ """
+ Scroll the element with the given identifier in the specified direction.
+
+ Args:
+ page (Page): The Playwright page object.
+ identifier (str): The element identifier.
+ direction (str): The direction to scroll ("up" or "down").
+ """
assert page is not None
await page.evaluate(
f"""
@@ -355,11 +503,16 @@ class PlaywrightController:
"""
)
- async def get_webpage_text(self, page: Page, n_lines: int = 100) -> str:
+ async def get_webpage_text(self, page: Page, n_lines: int = 50) -> str:
"""
- page: playwright page object
- n_lines: number of lines to return from the page innertext
- return: text in the first n_lines of the page
+ Retrieve the text content of the web page.
+
+ Args:
+ page (Page): The Playwright page object.
+ n_lines (int): The number of lines to return from the page inner text.
+
+ Returns:
+ str: The text content of the page.
"""
assert page is not None
try:
@@ -375,6 +528,22 @@ class PlaywrightController:
return ""
async def get_page_markdown(self, page: Page) -> str:
- # TODO: replace with mdconvert
+ """
+ Retrieve the markdown content of the web page.
+ Falls back to the first 200 lines of page text when markitdown is not installed.
+
+ Args:
+ page (Page): The Playwright page object.
+
+ Returns:
+ str: The markdown content of the page.
+ """
assert page is not None
- return await self.get_webpage_text(page, n_lines=1000)
+ if self._markdown_converter is None and MarkItDown is not None:
+     self._markdown_converter = MarkItDown()
+ if self._markdown_converter is None:
+     return await self.get_webpage_text(page, n_lines=200)
+ html = await page.evaluate("document.documentElement.outerHTML;")
+ res = self._markdown_converter.convert_stream(io.StringIO(html), file_extension=".html", url=page.url) # type: ignore
+ assert hasattr(res, "text_content") and isinstance(res.text_content, str)
+ return res.text_content
diff --git a/python/packages/autogen-ext/tests/test_playwright_controller.py b/python/packages/autogen-ext/tests/test_playwright_controller.py
new file mode 100644
index 000000000..177f0b561
--- /dev/null
+++ b/python/packages/autogen-ext/tests/test_playwright_controller.py
@@ -0,0 +1,78 @@
+import pytest
+from autogen_ext.agents.web_surfer.playwright_controller import PlaywrightController
+from playwright.async_api import async_playwright
+
+FAKE_HTML = """
+
+
+
+
+
+ Fake Page
+
+
+