From 61dec0a3ca5fdcf52541aa497dfc7ee59fb85fa0 Mon Sep 17 00:00:00 2001
From: afourney
Date: Fri, 26 Jul 2024 09:45:41 -0700
Subject: [PATCH] Added token counting to TeamOne (#264)

* Added token counting to TeamOne
---
 .../GAIA/Templates/TeamOne/scenario.py        | 33 +++++++---
 .../team-one/examples/example_websurfer.py    | 10 ++-
 .../multimodal_web_surfer.py                  | 64 +++++++++++--------
 3 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/python/benchmarks/GAIA/Templates/TeamOne/scenario.py b/python/benchmarks/GAIA/Templates/TeamOne/scenario.py
index 2a2d5a17a..f654c3f27 100644
--- a/python/benchmarks/GAIA/Templates/TeamOne/scenario.py
+++ b/python/benchmarks/GAIA/Templates/TeamOne/scenario.py
@@ -23,19 +23,22 @@ from team_one.agents.file_surfer import FileSurfer
 from team_one.utils import LogHandler, message_content_to_str
 
 import re
+import tiktoken
 from agnext.components.models import AssistantMessage
 
+encoding = None
+def count_token(value: str) -> int:
+    # TODO: Migrate to model_client.count_tokens
+    global encoding
+    if encoding is None:
+        encoding = tiktoken.encoding_for_model("gpt-4o-2024-05-13")
+    return len(encoding.encode(value))
 
 async def response_preparer(task: str, source: str, client: ChatCompletionClient, transcript: List[LLMMessage]) -> str:
-    messages: List[LLMMessage] = [
-        UserMessage(
-            content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. Here is a transcript of that conversation:",
-            source=source,
-        )
-    ]
+    messages: List[LLMMessage] = []
 
-    # copy them to this context
+    # copy them to this context 
     for message in transcript:
         messages.append(
             UserMessage(
@@ -45,6 +48,18 @@ async def response_preparer(task: str, source: str, client: ChatCompletionClient
             )
         )
 
+    # Remove messages until we are within 2k of the context window limit
+    while len(messages) and client.remaining_tokens(messages) < 2000:
+        messages.pop(0)
+
+    # Add the preamble
+    messages.insert(0,
+        UserMessage(
+            content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. Here is a transcript of that conversation:",
+            source=source,
+        )
+    )
+            source=source,
+        )
+    )
+
     # ask for the final answer
     messages.append(
         UserMessage(
@@ -164,8 +179,8 @@ async def main() -> None:
             mdconverter = MarkdownConverter()
             res = mdconverter.convert(filename)
             if res.text_content:
-                #if count_token(res.text_content) < 8000: # Don't put overly-large documents into the prompt
-                filename_prompt += "\n\nHere are the file's contents:\n\n" + res.text_content
+                if count_token(res.text_content) < 8000: # Don't put overly-large documents into the prompt
+                    filename_prompt += "\n\nHere are the file's contents:\n\n" + res.text_content
         except UnsupportedFormatException:
             pass

diff --git a/python/teams/team-one/examples/example_websurfer.py b/python/teams/team-one/examples/example_websurfer.py
index da75ece65..26c154b95 100644
--- a/python/teams/team-one/examples/example_websurfer.py
+++ b/python/teams/team-one/examples/example_websurfer.py
@@ -35,9 +35,13 @@ async def main() -> None:
 
     run_context = runtime.start()
 
-    actual_surfer = runtime._get_agent(web_surfer.id)  # type: ignore
-    assert isinstance(actual_surfer, MultimodalWebSurfer)
-    await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium")
+    actual_surfer = await runtime.try_get_underlying_agent_instance(web_surfer.id, type=MultimodalWebSurfer)
+    await actual_surfer.init(
+        model_client=client,
+        downloads_folder=os.getcwd(),
+        start_page="https://www.adamfourney.com",
+        browser_channel="chromium",
+    )
 
     await runtime.send_message(RequestReplyMessage(), user_proxy.id)
     await run_context.stop_when_idle()

diff --git a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py
index 60c6ceac8..fa60d7192 100644
--- a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py
+++ b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py
@@ -70,6 +70,8 @@ VIEWPORT_WIDTH = 1440
 MLM_HEIGHT = 765
 MLM_WIDTH = 1224
 
+SCREENSHOT_TOKENS = 1105
+
 logger = logging.getLogger(EVENT_LOGGER_NAME + ".MultimodalWebSurfer")
 
@@ -718,20 +720,6 @@ When deciding between tools, consider if the request can be best addressed by:
 
         page_markdown: str = await self._get_page_markdown()
 
-        # TODO: Get token count working
-
-        buffer = page_markdown
-        # buffer: str = ""
-        # for line in re.split(r"([\r\n]+)", page_markdown):
-        #    tokens = count_token(buffer + line)
-        #    if tokens + 1024 > token_limit: # Leave room for our summary
-        #        break
-        #    buffer += line
-
-        buffer = buffer.strip()
-        if len(buffer) == 0:
-            return "Nothing to summarize."
-
         title: str = self._page.url
         try:
             title = await self._page.title()
@@ -742,29 +730,55 @@ When deciding between tools, consider if the request can be best addressed by:
         screenshot = Image.open(io.BytesIO(await self._page.screenshot()))
         scaled_screenshot = screenshot.resize((MLM_WIDTH, MLM_HEIGHT))
         screenshot.close()
+        ag_image = AGImage.from_pil(scaled_screenshot)
 
-        prompt = f"We are visiting the webpage '{title}'. Its full-text contents are pasted below, along with a screenshot of the page's current viewport."
-        if question is not None:
-            prompt += (
-                f" Please summarize the webpage into one or two paragraphs with respect to '{question}':\n\n{buffer}"
-            )
-        else:
-            prompt += f" Please summarize the webpage into one or two paragraphs:\n\n{buffer}"
-
-        # Add the multimodal message and make the request
+        # Prepare the system prompt
         messages: List[LLMMessage] = []
         messages.append(
             SystemMessage(content="You are a helpful assistant that can summarize long documents to answer question.")
         )
+
+        # Prepare the main prompt
+        prompt = f"We are visiting the webpage '{title}'. Its full-text contents are pasted below, along with a screenshot of the page's current viewport."
+        if question is not None:
+            prompt += f" Please summarize the webpage into one or two paragraphs with respect to '{question}':\n\n"
+        else:
+            prompt += " Please summarize the webpage into one or two paragraphs:\n\n"
+
+        # Grow the buffer (which is added to the prompt) until we overflow the context window or run out of lines
+        buffer = ""
+        for line in re.split(r"([\r\n]+)", page_markdown):
+            # Build a text-only candidate message for token counting; the
+            # screenshot's cost is reserved separately via SCREENSHOT_TOKENS.
+            message = UserMessage(
+                content=prompt + buffer + line,
+                source=self.metadata["name"],
+            )
+
+            remaining = self._model_client.remaining_tokens(messages + [message])
+            if remaining > SCREENSHOT_TOKENS:
+                buffer += line
+            else:
+                break
+
+        # Nothing to do
+        buffer = buffer.strip()
+        if len(buffer) == 0:
+            return "Nothing to summarize."
+
+        # Append the message
         messages.append(
             UserMessage(
                 content=[
-                    prompt,
-                    AGImage.from_pil(scaled_screenshot),
+                    prompt + buffer,
+                    ag_image,
                 ],
                 source=self.metadata["name"],
            )
         )
+
+        # Generate the response
         response = await self._model_client.create(messages)
         scaled_screenshot.close()
         assert isinstance(response.content, str)
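Both files in this patch apply the same token-budgeting pattern: count tokens with tiktoken, then shrink the prompt payload until it fits the model's context window minus a reserved margin. Below is a minimal, self-contained sketch of that pattern. It is illustrative only: it uses plain strings rather than agnext's LLMMessage, and CONTEXT_WINDOW, RESERVED_TOKENS, and fit_to_budget are hypothetical names standing in for client.remaining_tokens(), which also accounts for per-message formatting overhead.

```python
# Illustrative sketch of the token-budgeting pattern used in this patch.
# CONTEXT_WINDOW, RESERVED_TOKENS, and fit_to_budget are assumed names,
# not part of agnext or team_one.
from typing import List

import tiktoken

CONTEXT_WINDOW = 128_000  # assumed context size for gpt-4o-2024-05-13
RESERVED_TOKENS = 2_000   # headroom, mirroring the 2k margin in response_preparer

_encoding = tiktoken.encoding_for_model("gpt-4o-2024-05-13")


def count_token(value: str) -> int:
    # Same approach as the patch's count_token: encode the text and count tokens.
    return len(_encoding.encode(value))


def fit_to_budget(messages: List[str]) -> List[str]:
    # Drop the oldest messages until the transcript fits the budget, as
    # response_preparer does before re-inserting the task preamble at index 0.
    trimmed = list(messages)
    while trimmed and sum(count_token(m) for m in trimmed) > CONTEXT_WINDOW - RESERVED_TOKENS:
        trimmed.pop(0)
    return trimmed
```

The web surfer's summarizer applies the same idea in the other direction: rather than dropping old messages, it grows a text buffer line by line and stops once the remaining budget would dip below SCREENSHOT_TOKENS, the fixed cost the patch assumes for the viewport screenshot.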