mirror of
https://github.com/microsoft/autogen.git
synced 2026-04-20 03:02:16 -04:00
WebSurfer: print viewport text (#5329)
This PR adds a method that approximately extracts the text visible in the viewport of the web browser (as opposed to always printing the first 50 lines, or relying entirely on OCR).
This commit is contained in:
@@ -264,8 +264,6 @@ class MultimodalWebSurfer(BaseChatAgent, Component[MultimodalWebSurferConfig]):
|
||||
TOOL_SLEEP,
|
||||
TOOL_HOVER,
|
||||
]
|
||||
# Number of lines of text to extract from the page in the absence of OCR
|
||||
self.n_lines_page_text = 50
|
||||
self.did_lazy_init = False # flag to check if we have initialized the browser
|
||||
|
||||
async def _lazy_init(
|
||||
@@ -743,7 +741,7 @@ class MultimodalWebSurfer(BaseChatAgent, Component[MultimodalWebSurferConfig]):
|
||||
ocr_text = (
|
||||
await self._get_ocr_text(new_screenshot, cancellation_token=cancellation_token)
|
||||
if self.use_ocr is True
|
||||
else await self._playwright_controller.get_webpage_text(self._page, n_lines=self.n_lines_page_text)
|
||||
else await self._playwright_controller.get_visible_text(self._page)
|
||||
)
|
||||
|
||||
# Return the complete observation
|
||||
@@ -752,7 +750,7 @@ class MultimodalWebSurfer(BaseChatAgent, Component[MultimodalWebSurferConfig]):
|
||||
if self.use_ocr:
|
||||
message_content += f"Automatic OCR of the page screenshot has detected the following text:\n\n{ocr_text}"
|
||||
else:
|
||||
message_content += f"The first {self.n_lines_page_text} lines of the page text is:\n\n{ocr_text}"
|
||||
message_content += f"The following text is visible in the viewport:\n\n{ocr_text}"
|
||||
|
||||
return [
|
||||
message_content,
|
||||
|
||||
@@ -367,10 +367,63 @@ var MultimodalWebSurfer = MultimodalWebSurfer || (function() {
|
||||
return results;
|
||||
};
|
||||
|
||||
|
||||
let getVisibleText = function() {
    // Approximates the text currently visible in the viewport.
    //
    // Walks every text node under <body>, keeps those with at least one client
    // rect that intersects the viewport, and concatenates their
    // whitespace-collapsed contents. A newline is appended after text whose
    // parent element is not displayed inline, so block-level content ends up on
    // separate lines. Returns a string with blank lines removed.

    // Documents without a <body> (e.g. a bare XML/SVG document) have nothing to
    // walk, and createTreeWalker throws when handed a null root.
    if (!document.body) {
        return "";
    }

    // Get the window's current viewport boundaries
    const viewportHeight = window.innerHeight || document.documentElement.clientHeight;
    const viewportWidth = window.innerWidth || document.documentElement.clientWidth;

    // True when the rect has a non-empty area and lies inside (or partially
    // inside) the viewport.
    const intersectsViewport = function(rect) {
        return rect.width > 0 &&
            rect.height > 0 &&
            rect.bottom >= 0 &&
            rect.right >= 0 &&
            rect.top <= viewportHeight &&
            rect.left <= viewportWidth;
    };

    let textInView = "";
    const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);

    while (walker.nextNode()) {
        const textNode = walker.currentNode;

        // Create a range to retrieve bounding rectangles of the current text node
        const range = document.createRange();
        range.selectNodeContents(textNode);

        // One intersecting rect is enough to count the whole node as in view
        if (!Array.from(range.getClientRects()).some(intersectsViewport)) {
            continue;
        }

        const parent = textNode.parentNode;
        const style = (parent && parent.nodeType === Node.ELEMENT_NODE)
            ? window.getComputedStyle(parent)
            : null;

        // Text under `visibility: hidden` / `collapse` still produces client
        // rects but is not rendered, so it must not be reported as visible.
        // `visibility` is inherited, so checking the direct parent is enough.
        if (style && style.visibility !== "visible") {
            continue;
        }

        textInView += textNode.nodeValue.replace(/\s+/g, " ");

        // Is the parent a block element? If so, end the line after its text.
        if (style && ["inline", "none"].indexOf(style.display) === -1) {
            textInView += "\n";
        }
    }

    // Remove blank lines from textInView
    textInView = textInView.replace(/^\s*\n/gm, "").trim().replace(/\n+/g, "\n");
    return textInView;
};
|
||||
|
||||
return {
|
||||
getInteractiveRects: getInteractiveRects,
|
||||
getVisualViewport: getVisualViewport,
|
||||
getFocusedElementId: getFocusedElementId,
|
||||
getPageMetadata: getPageMetadata,
|
||||
getVisibleText: getVisibleText,
|
||||
};
|
||||
})();
|
||||
|
||||
@@ -527,6 +527,25 @@ class PlaywrightController:
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
async def get_visible_text(self, page: Page) -> str:
    """
    Return an approximation of the text currently visible in the browser viewport.

    The text is produced by evaluating ``MultimodalWebSurfer.getVisibleText()``
    inside the page.

    Args:
        page (Page): The Playwright page object.

    Returns:
        str: The string returned by the in-page ``getVisibleText`` helper.
    """
    assert page is not None

    # Evaluate the page script first (presumably it defines the
    # `MultimodalWebSurfer` object used below -- confirm against the script).
    # Any error raised by this step is deliberately swallowed; the call that
    # follows is not guarded.
    try:
        await page.evaluate(self._page_script)
    except Exception:
        pass

    visible_text = await page.evaluate("MultimodalWebSurfer.getVisibleText();")
    assert isinstance(visible_text, str)
    return visible_text
|
||||
|
||||
async def get_page_markdown(self, page: Page) -> str:
|
||||
"""
|
||||
Retrieve the markdown content of the web page.
|
||||
|
||||
Reference in New Issue
Block a user