From 61dec0a3ca5fdcf52541aa497dfc7ee59fb85fa0 Mon Sep 17 00:00:00 2001
From: afourney
Date: Fri, 26 Jul 2024 09:45:41 -0700
Subject: [PATCH] Added token counting to TeamOne (#264)

* Added token counting to TeamOne
---
 .../GAIA/Templates/TeamOne/scenario.py        | 33 +++++++---
 .../team-one/examples/example_websurfer.py    | 10 ++-
 .../multimodal_web_surfer.py                  | 64 +++++++++++--------
 3 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/python/benchmarks/GAIA/Templates/TeamOne/scenario.py b/python/benchmarks/GAIA/Templates/TeamOne/scenario.py
index 2a2d5a17a..f654c3f27 100644
--- a/python/benchmarks/GAIA/Templates/TeamOne/scenario.py
+++ b/python/benchmarks/GAIA/Templates/TeamOne/scenario.py
@@ -23,19 +23,22 @@ from team_one.agents.file_surfer import FileSurfer
 from team_one.utils import LogHandler, message_content_to_str
 
 import re
+import tiktoken
 from agnext.components.models import AssistantMessage
 
+encoding = None
+def count_token(value: str) -> int:
+    # TODO: Migrate to model_client.count_tokens
+    global encoding
+    if encoding is None:
+        encoding = tiktoken.encoding_for_model("gpt-4o-2024-05-13")
+    return len(encoding.encode(value))
 
 async def response_preparer(task: str, source: str, client: ChatCompletionClient, transcript: List[LLMMessage]) -> str:
-    messages: List[LLMMessage] = [
-        UserMessage(
-            content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. Here is a transcript of that conversation:",
-            source=source,
-        )
-    ]
+    messages: List[LLMMessage] = []
 
-    # copy them to this context
+    # copy them to this context 
     for message in transcript:
         messages.append(
             UserMessage(
@@ -45,6 +48,18 @@ async def response_preparer(task: str, source: str, client: ChatCompletionClient
             )
         )
 
+    # Remove messages until we are within 2k of the context window limit
+    while len(messages) and client.remaining_tokens(messages) < 2000:
+        messages.pop(0)
+
+    # Add the preamble
+    messages.insert(0,
+        UserMessage(
+            content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. Here is a transcript of that conversation:",
+            source=source,
+        )
+    )
+            source=source,
+        )
+    )
+
     # ask for the final answer
     messages.append(
         UserMessage(
@@ -164,8 +179,8 @@ async def main() -> None:
             mdconverter = MarkdownConverter()
             res = mdconverter.convert(filename)
             if res.text_content:
-                #if count_token(res.text_content) < 8000: # Don't put overly-large documents into the prompt
-                filename_prompt += "\n\nHere are the file's contents:\n\n" + res.text_content
+                if count_token(res.text_content) < 8000: # Don't put overly-large documents into the prompt
+                    filename_prompt += "\n\nHere are the file's contents:\n\n" + res.text_content
         except UnsupportedFormatException:
             pass

diff --git a/python/teams/team-one/examples/example_websurfer.py b/python/teams/team-one/examples/example_websurfer.py
index da75ece65..26c154b95 100644
--- a/python/teams/team-one/examples/example_websurfer.py
+++ b/python/teams/team-one/examples/example_websurfer.py
@@ -35,9 +35,13 @@ async def main() -> None:
 
     run_context = runtime.start()
 
-    actual_surfer = runtime._get_agent(web_surfer.id)  # type: ignore
-    assert isinstance(actual_surfer, MultimodalWebSurfer)
-    await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium")
+    actual_surfer = await runtime.try_get_underlying_agent_instance(web_surfer.id, type=MultimodalWebSurfer)
+    await actual_surfer.init(
+        model_client=client,
+        downloads_folder=os.getcwd(),
+        start_page="https://www.adamfourney.com",
+        browser_channel="chromium",
+    )
 
     await runtime.send_message(RequestReplyMessage(), user_proxy.id)
     await run_context.stop_when_idle()

diff --git a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py
index 60c6ceac8..fa60d7192 100644
--- a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py
+++ b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py
@@ -70,6 +70,8 @@ VIEWPORT_WIDTH = 1440
 MLM_HEIGHT = 765
 MLM_WIDTH = 1224
 
+SCREENSHOT_TOKENS = 1105
+
 logger = logging.getLogger(EVENT_LOGGER_NAME + ".MultimodalWebSurfer")
 
@@ -718,20 +720,6 @@ When deciding between tools, consider if the request can be best addressed by:
 
         page_markdown: str = await self._get_page_markdown()
 
-        # TODO: Get token count working
-
-        buffer = page_markdown
-        # buffer: str = ""
-        # for line in re.split(r"([\r\n]+)", page_markdown):
-        #    tokens = count_token(buffer + line)
-        #    if tokens + 1024 > token_limit: # Leave room for our summary
-        #        break
-        #    buffer += line
-
-        buffer = buffer.strip()
-        if len(buffer) == 0:
-            return "Nothing to summarize."
-
         title: str = self._page.url
         try:
             title = await self._page.title()
@@ -742,29 +730,55 @@ When deciding between tools, consider if the request can be best addressed by:
         screenshot = Image.open(io.BytesIO(await self._page.screenshot()))
         scaled_screenshot = screenshot.resize((MLM_WIDTH, MLM_HEIGHT))
         screenshot.close()
+        ag_image = AGImage.from_pil(scaled_screenshot)
 
-        prompt = f"We are visiting the webpage '{title}'. Its full-text contents are pasted below, along with a screenshot of the page's current viewport."
-        if question is not None:
-            prompt += (
-                f" Please summarize the webpage into one or two paragraphs with respect to '{question}':\n\n{buffer}"
-            )
-        else:
-            prompt += f" Please summarize the webpage into one or two paragraphs:\n\n{buffer}"
-
-        # Add the multimodal message and make the request
+        # Prepare the system prompt
         messages: List[LLMMessage] = []
         messages.append(
             SystemMessage(content="You are a helpful assistant that can summarize long documents to answer question.")
         )
+
+        # Prepare the main prompt
+        prompt = f"We are visiting the webpage '{title}'. Its full-text contents are pasted below, along with a screenshot of the page's current viewport."
+        if question is not None:
+            prompt += f" Please summarize the webpage into one or two paragraphs with respect to '{question}':\n\n"
+        else:
+            prompt += " Please summarize the webpage into one or two paragraphs:\n\n"
+
+        # Grow the buffer (which is added to the prompt) until we overflow the context window or run out of lines
+        buffer = ""
+        for line in re.split(r"([\r\n]+)", page_markdown):
+            # Build a text-only candidate message for token counting; the
+            # screenshot's cost is reserved separately via SCREENSHOT_TOKENS.
+            message = UserMessage(
+                content=prompt + buffer + line,
+                source=self.metadata["name"],
+            )
+
+            remaining = self._model_client.remaining_tokens(messages + [message])
+            if remaining > SCREENSHOT_TOKENS:
+                buffer += line
+            else:
+                break
+
+        # Nothing to do
+        buffer = buffer.strip()
+        if len(buffer) == 0:
+            return "Nothing to summarize."
+
+        # Append the message
         messages.append(
             UserMessage(
                 content=[
-                    prompt,
-                    AGImage.from_pil(scaled_screenshot),
+                    prompt + buffer,
+                    ag_image,
                 ],
                 source=self.metadata["name"],
            )
         )
+
+        # Generate the response
         response = await self._model_client.create(messages)
         scaled_screenshot.close()
         assert isinstance(response.content, str)
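Both files in this patch apply the same token-budgeting pattern: count tokens with tiktoken, then shrink the prompt payload until it fits the model's context window minus a reserved margin. Below is a minimal, self-contained sketch of that pattern. It is illustrative only: it uses plain strings rather than agnext's LLMMessage, and CONTEXT_WINDOW, RESERVED_TOKENS, and fit_to_budget are hypothetical names standing in for client.remaining_tokens(), which also accounts for per-message formatting overhead.

```python
# Illustrative sketch of the token-budgeting pattern used in this patch.
# CONTEXT_WINDOW, RESERVED_TOKENS, and fit_to_budget are assumed names,
# not part of agnext or team_one.
from typing import List

import tiktoken

CONTEXT_WINDOW = 128_000  # assumed context size for gpt-4o-2024-05-13
RESERVED_TOKENS = 2_000   # headroom, mirroring the 2k margin in response_preparer

_encoding = tiktoken.encoding_for_model("gpt-4o-2024-05-13")


def count_token(value: str) -> int:
    # Same approach as the patch's count_token: encode the text and count tokens.
    return len(_encoding.encode(value))


def fit_to_budget(messages: List[str]) -> List[str]:
    # Drop the oldest messages until the transcript fits the budget, as
    # response_preparer does before re-inserting the task preamble at index 0.
    trimmed = list(messages)
    while trimmed and sum(count_token(m) for m in trimmed) > CONTEXT_WINDOW - RESERVED_TOKENS:
        trimmed.pop(0)
    return trimmed
```

The web surfer's summarizer applies the same idea in the other direction: rather than dropping old messages, it grows a text buffer line by line and stops once the remaining budget would dip below SCREENSHOT_TOKENS, the fixed cost the patch assumes for the viewport screenshot.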