Added token counting to TeamOne (#264)

* Added token counting to TeamOne
2026-02-11 09:25:46 -05:00 · 2024-07-26 09:45:41 -07:00
parent 25b9a91328
commit 61dec0a3ca
3 changed files with 70 additions and 37 deletions
--- a/python/teams/team-one/examples/example_websurfer.py
+++ b/python/teams/team-one/examples/example_websurfer.py
@@ -35,9 +35,13 @@ async def main() -> None:

    run_context = runtime.start()

-    actual_surfer = runtime._get_agent(web_surfer.id)  # type: ignore
-    assert isinstance(actual_surfer, MultimodalWebSurfer)
-    await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium")
+    actual_surfer = await runtime.try_get_underlying_agent_instance(web_surfer.id, type=MultimodalWebSurfer)
+    await actual_surfer.init(
+        model_client=client,
+        downloads_folder=os.getcwd(),
+        start_page="https://www.adamfourney.com",
+        browser_channel="chromium",
+    )

    await runtime.send_message(RequestReplyMessage(), user_proxy.id)
    await run_context.stop_when_idle()
--- a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py
+++ b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py
@@ -70,6 +70,8 @@ VIEWPORT_WIDTH = 1440
 MLM_HEIGHT = 765
 MLM_WIDTH = 1224

+SCREENSHOT_TOKENS = 1105
+
 logger = logging.getLogger(EVENT_LOGGER_NAME + ".MultimodalWebSurfer")


@@ -718,20 +720,6 @@ When deciding between tools, consider if the request can be best addressed by:

        page_markdown: str = await self._get_page_markdown()

-        # TODO: Get token count working
-
-        buffer = page_markdown
-        # buffer: str = ""
-        # for line in re.split(r"([\r\n]+)", page_markdown):
-        #    tokens = count_token(buffer + line)
-        #    if tokens + 1024 > token_limit:  # Leave room for our summary
-        #        break
-        #    buffer += line
-
-        buffer = buffer.strip()
-        if len(buffer) == 0:
-            return "Nothing to summarize."
-
        title: str = self._page.url
        try:
            title = await self._page.title()
@@ -742,29 +730,55 @@ When deciding between tools, consider if the request can be best addressed by:
        screenshot = Image.open(io.BytesIO(await self._page.screenshot()))
        scaled_screenshot = screenshot.resize((MLM_WIDTH, MLM_HEIGHT))
        screenshot.close()
+        ag_image = AGImage.from_pil(scaled_screenshot)

-        prompt = f"We are visiting the webpage '{title}'. Its full-text contents are pasted below, along with a screenshot of the page's current viewport."
-        if question is not None:
-            prompt += (
-                f" Please summarize the webpage into one or two paragraphs with respect to '{question}':\n\n{buffer}"
-            )
-        else:
-            prompt += f" Please summarize the webpage into one or two paragraphs:\n\n{buffer}"
-
-        # Add the multimodal message and make the request
+        # Prepare the system prompt
        messages: List[LLMMessage] = []
        messages.append(
            SystemMessage(content="You are a helpful assistant that can summarize long documents to answer question.")
        )
+
+        # Prepare the main prompt
+        prompt = f"We are visiting the webpage '{title}'. Its full-text content are pasted below, along with a screenshot of the page's current viewport."
+        if question is not None:
+            prompt += f" Please summarize the webpage into one or two paragraphs with respect to '{question}':\n\n"
+        else:
+            prompt += " Please summarize the webpage into one or two paragraphs:\n\n"
+
+        # Grow the buffer (which is added to the prompt) line by line, stopping once adding the next line would leave no more than SCREENSHOT_TOKENS tokens remaining, or when we run out of lines
+        buffer = ""
+        for line in re.split(r"([\r\n]+)", page_markdown):
+            message = UserMessage(
+                # content=[
+                prompt + buffer + line,
+                #    ag_image,
+                # ],
+                source=self.metadata["name"],
+            )
+
+            remaining = self._model_client.remaining_tokens(messages + [message])
+            if remaining > SCREENSHOT_TOKENS:
+                buffer += line
+            else:
+                break
+
+        # Nothing to do
+        buffer = buffer.strip()
+        if len(buffer) == 0:
+            return "Nothing to summarize."
+
+        # Append the message
        messages.append(
            UserMessage(
                content=[
-                    prompt,
-                    AGImage.from_pil(scaled_screenshot),
+                    prompt + buffer,
+                    ag_image,
                ],
                source=self.metadata["name"],
            )
        )
+
+        # Generate the response
        response = await self._model_client.create(messages)
        scaled_screenshot.close()
        assert isinstance(response.content, str)