Pass TestSearch benchmark consistently (Add browse_website TOKENS_TO_TRIGGER_SUMMARY) (#5092)

* Added SUMMARIZATION_TRIGGER_LENGTH
browse_website won't summarize content that's shorter
than SUMMARIZATION_TRIGGER_LENGTH.
It defaults to 250 characters, which is approximately 50 tokens.

* Refactor BrowserOptions

* Use token count instead of character length
to trigger summarization

* Bugfix

* fix: Always return links even if not summarizing
feat: Increase the number of links returned from 5 to 20

---------

Co-authored-by: lc0rp <2609411+lc0rp@users.noreply.github.com>
Co-authored-by: James Collins <collijk@uw.edu>
This commit is contained in:
Luke
2023-08-01 14:48:13 -04:00
committed by GitHub
parent a593c32727
commit 3a2d08fb41

View File

@@ -2,13 +2,15 @@
from __future__ import annotations
from autogpt.llm.utils.token_counter import count_string_tokens
COMMAND_CATEGORY = "web_browse"
COMMAND_CATEGORY_TITLE = "Web Browsing"
import logging
from pathlib import Path
from sys import platform
from typing import Optional, Type
from typing import Optional
from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
@@ -16,6 +18,7 @@ from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeDriverService
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.options import ArgOptions as BrowserOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeDriverService
from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
@@ -38,9 +41,9 @@ from autogpt.memory.vector import MemoryItem, get_memory
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
from autogpt.url_utils.validators import validate_url
BrowserOptions = ChromeOptions | EdgeOptions | FirefoxOptions | SafariOptions
FILE_DIR = Path(__file__).parent.parent
TOKENS_TO_TRIGGER_SUMMARY = 50
LINKS_TO_RETURN = 20
@command(
@@ -64,25 +67,30 @@ def browse_website(url: str, question: str, agent: Agent) -> str:
question (str): The question asked by the user
Returns:
Tuple[str, WebDriver]: The answer and links to the user and the webdriver
str: The answer and links to the user
"""
driver = None
try:
driver, text = scrape_text_with_selenium(url, agent)
add_header(driver)
if TOKENS_TO_TRIGGER_SUMMARY < count_string_tokens(text, agent.llm.name):
text = summarize_memorize_webpage(url, text, question, agent, driver)
links = scrape_links_with_selenium(driver, url)
# Limit links to LINKS_TO_RETURN
if len(links) > LINKS_TO_RETURN:
links = links[:LINKS_TO_RETURN]
return f"Answer gathered from website: {text}\n\nLinks: {links}"
except WebDriverException as e:
# These errors are often quite long and include lots of context.
# Just grab the first line.
msg = e.msg.split("\n")[0]
return f"Error: {msg}"
add_header(driver)
summary = summarize_memorize_webpage(url, text, question, agent, driver)
links = scrape_links_with_selenium(driver, url)
# Limit links to 5
if len(links) > 5:
links = links[:5]
close_browser(driver)
return f"Answer gathered from website: {summary}\n\nLinks: {links}"
finally:
if driver:
close_browser(driver)
def scrape_text_with_selenium(url: str, agent: Agent) -> tuple[WebDriver, str]:
@@ -96,7 +104,7 @@ def scrape_text_with_selenium(url: str, agent: Agent) -> tuple[WebDriver, str]:
"""
logging.getLogger("selenium").setLevel(logging.CRITICAL)
options_available: dict[str, Type[BrowserOptions]] = {
options_available: dict[str, BrowserOptions] = {
"chrome": ChromeOptions,
"edge": EdgeOptions,
"firefox": FirefoxOptions,