WebSurfer: print viewport text (#5329)

This PR adds a method that approximately extracts the text visible in
the viewport of the web browser (as opposed to always printing the first
50 lines, or relying entirely on OCR).
This commit is contained in:
afourney
2025-02-03 08:42:18 -08:00
committed by GitHub
parent 227b875f28
commit 877796ded1
3 changed files with 74 additions and 4 deletions

View File

@@ -264,8 +264,6 @@ class MultimodalWebSurfer(BaseChatAgent, Component[MultimodalWebSurferConfig]):
TOOL_SLEEP,
TOOL_HOVER,
]
# Number of lines of text to extract from the page in the absence of OCR
self.n_lines_page_text = 50
self.did_lazy_init = False # flag to check if we have initialized the browser
async def _lazy_init(
@@ -743,7 +741,7 @@ class MultimodalWebSurfer(BaseChatAgent, Component[MultimodalWebSurferConfig]):
ocr_text = (
await self._get_ocr_text(new_screenshot, cancellation_token=cancellation_token)
if self.use_ocr is True
else await self._playwright_controller.get_webpage_text(self._page, n_lines=self.n_lines_page_text)
else await self._playwright_controller.get_visible_text(self._page)
)
# Return the complete observation
@@ -752,7 +750,7 @@ class MultimodalWebSurfer(BaseChatAgent, Component[MultimodalWebSurferConfig]):
if self.use_ocr:
message_content += f"Automatic OCR of the page screenshot has detected the following text:\n\n{ocr_text}"
else:
message_content += f"The first {self.n_lines_page_text} lines of the page text is:\n\n{ocr_text}"
message_content += f"The following text is visible in the viewport:\n\n{ocr_text}"
return [
message_content,

View File

@@ -367,10 +367,63 @@ var MultimodalWebSurfer = MultimodalWebSurfer || (function() {
return results;
};
let getVisibleText = function() {
    // Approximate the text a user can currently see: walk every text node
    // under <body>, keep those with at least one layout box intersecting the
    // viewport, and join them, starting a new line after text whose parent is
    // not an inline element.
    //
    // Returns a string with whitespace runs collapsed and blank lines removed.

    // A document without a <body> has nothing to walk, and createTreeWalker
    // would throw on a null root.
    if (!document.body) {
        return "";
    }

    // Get the window's current viewport boundaries
    const viewportHeight = window.innerHeight || document.documentElement.clientHeight;
    const viewportWidth = window.innerWidth || document.documentElement.clientWidth;

    let textInView = "";
    const walker = document.createTreeWalker(
        document.body,
        NodeFilter.SHOW_TEXT,
        null,
        false
    );

    while (walker.nextNode()) {
        const textNode = walker.currentNode;

        // Create a range to retrieve bounding rectangles of the current text node
        const range = document.createRange();
        range.selectNodeContents(textNode);
        const rects = range.getClientRects();

        // Check if any rect is inside (or partially inside) the viewport
        for (const rect of rects) {
            const isVisible =
                rect.width > 0 &&
                rect.height > 0 &&
                rect.bottom >= 0 &&
                rect.right >= 0 &&
                rect.top <= viewportHeight &&
                rect.left <= viewportWidth;

            if (!isVisible) {
                continue;
            }

            const parent = textNode.parentNode;
            const style = parent ? window.getComputedStyle(parent) : null;

            // Elements with visibility hidden/collapse keep their layout
            // boxes, so the rect test alone would report their text as
            // visible. The property is inherited, so checking the parent's
            // computed value also covers hidden ancestors.
            if (style && (style.visibility === "hidden" || style.visibility === "collapse")) {
                break;
            }

            textInView += textNode.nodeValue.replace(/\s+/g, " ");

            // Is the parent a block element? If so, end the line after its text.
            if (style && ["inline", "hidden", "none"].indexOf(style.display) === -1) {
                textInView += "\n";
            }

            break; // No need to check other rects once found visible
        }
    }

    // Remove blank lines from textInView
    textInView = textInView.replace(/^\s*\n/gm, "").trim().replace(/\n+/g, "\n");
    return textInView;
};
return {
getInteractiveRects: getInteractiveRects,
getVisualViewport: getVisualViewport,
getFocusedElementId: getFocusedElementId,
getPageMetadata: getPageMetadata,
getVisibleText: getVisibleText,
};
})();

View File

@@ -527,6 +527,25 @@ class PlaywrightController:
except Exception:
return ""
async def get_visible_text(self, page: Page) -> str:
    """
    Retrieve (approximately) the text currently shown in the browser viewport.

    The helper script stored in ``self._page_script`` is evaluated in the page
    first, then ``MultimodalWebSurfer.getVisibleText()`` is called in the page
    and its result returned.

    Args:
        page (Page): The Playwright page object.

    Returns:
        str: The text reported as visible in the viewport.
    """
    assert page is not None

    try:
        # Best-effort injection of the helper script; a failure here is
        # deliberately ignored.
        await page.evaluate(self._page_script)
    except Exception:
        pass

    # This call is outside the try block, so any error it raises propagates
    # to the caller.
    visible_text = await page.evaluate("MultimodalWebSurfer.getVisibleText();")
    assert isinstance(visible_text, str)
    return visible_text
async def get_page_markdown(self, page: Page) -> str:
"""
Retrieve the markdown content of the web page.