From 211bfa01c369e77bef61fcfefceff4a0243ba113 Mon Sep 17 00:00:00 2001 From: afourney Date: Wed, 17 Jul 2024 09:51:19 -0700 Subject: [PATCH] TeamOne implementation of GAIA (#221) Port of GAIA benchmark --- python/benchmarks/GAIA/ENV.json.sample | 7 + .../GAIA/Scripts/custom_tabulate.py | 197 +++++++++++++ python/benchmarks/GAIA/Scripts/init_tasks.py | 157 ++++++++++ .../Templates/TeamOne/expected_answer.txt | 1 + .../GAIA/Templates/TeamOne/prompt.txt | 1 + .../GAIA/Templates/TeamOne/requirements.txt | 1 + .../GAIA/Templates/TeamOne/scenario.py | 197 +++++++++++++ .../team-one/examples/example_websurfer.py | 3 +- python/teams/team-one/pyproject.toml | 2 +- .../src/team_one/agents/base_agent.py | 10 +- .../team-one/src/team_one/agents/coder.py | 18 +- .../multimodal_web_surfer.py | 279 ++++++++++-------- .../multimodal_web_surfer/tool_definitions.py | 2 +- .../src/team_one/agents/orchestrator.py | 6 +- .../teams/team-one/src/team_one/messages.py | 16 +- python/teams/team-one/src/team_one/utils.py | 32 +- .../tools/agbench/src/agbench/res/Dockerfile | 5 +- 17 files changed, 790 insertions(+), 144 deletions(-) create mode 100644 python/benchmarks/GAIA/ENV.json.sample create mode 100644 python/benchmarks/GAIA/Scripts/custom_tabulate.py create mode 100644 python/benchmarks/GAIA/Scripts/init_tasks.py create mode 100644 python/benchmarks/GAIA/Templates/TeamOne/expected_answer.txt create mode 100644 python/benchmarks/GAIA/Templates/TeamOne/prompt.txt create mode 100644 python/benchmarks/GAIA/Templates/TeamOne/requirements.txt create mode 100644 python/benchmarks/GAIA/Templates/TeamOne/scenario.py diff --git a/python/benchmarks/GAIA/ENV.json.sample b/python/benchmarks/GAIA/ENV.json.sample new file mode 100644 index 000000000..2c26d1826 --- /dev/null +++ b/python/benchmarks/GAIA/ENV.json.sample @@ -0,0 +1,7 @@ +{ + "CHAT_COMPLETION_PROVIDER": "azure", + "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-05-01-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT_HERE\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}", + "BING_API_KEY": "YOUR_KEY_KEY", + "HOMEPAGE": "https://www.bing.com/", + "WEB_SURFER_DEBUG_DIR": "/autogen/debug" +} diff --git a/python/benchmarks/GAIA/Scripts/custom_tabulate.py b/python/benchmarks/GAIA/Scripts/custom_tabulate.py new file mode 100644 index 000000000..ec51863e9 --- /dev/null +++ b/python/benchmarks/GAIA/Scripts/custom_tabulate.py @@ -0,0 +1,197 @@ +import os +import sys +import re +from agbench.tabulate_cmd import default_tabulate +import json +import pandas as pd +import sqlite3 +import glob +import numpy as np + +EXCLUDE_DIR_NAMES = ["__pycache__"] + + +def normalize_answer(a): + # Lower case + # Trim (left and right) + # standardize comma separated values + # Replace multiple spaces with one space + # Remove trailing punctuation + norm_answer = ", ".join(a.strip().lower().split(",")) + norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer)) + return norm_answer + + +def scorer(instance_dir): + # Read the expected answer + expected_answer_file = os.path.join(instance_dir, "expected_answer.txt") + if not os.path.isfile(expected_answer_file): + return None + + expected_answer = None + with open(expected_answer_file, "rt") as fh: + expected_answer = fh.read().strip() + + # Read the console + console_log_file = os.path.join(instance_dir, "console_log.txt") + if not os.path.isfile(console_log_file): + return None + + console_log = "" + with open(console_log_file, "rt") as fh: + console_log = fh.read() + + final_answer = None + m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL) + if m: + final_answer = m.group(1).strip() + + # Missing the final answer line + if final_answer is None: + return None + + # Return true if they are equal after normalization + n_ex = normalize_answer(expected_answer) + n_final = normalize_answer(final_answer) + return ( + (n_ex != "" and n_ex == n_final), + n_ex, + n_final + ) + + +def get_number_of_chat_messages(chat_messages_dir): + result = 0 + for file in glob.glob(f"{chat_messages_dir}/*_messages.json"): + with open(file, "r") as f: + content = json.load(f) + for agent, messages in content.items(): + result += len(messages) + return result + + +def main(args): + parsed_args, all_results = default_tabulate(args, scorer=scorer) + excel_path = parsed_args.excel + + if excel_path: + excel_dir = os.path.dirname(excel_path) or "." + if not os.path.exists(excel_dir): + os.makedirs(excel_dir, exist_ok=True) + + if not excel_path.endswith((".xlsx", ".xls")): + excel_path += ".xlsx" + + runlogs = parsed_args.runlogs if parsed_args.runlogs.endswith("/") else parsed_args.runlogs + "/" + + if os.path.isdir(runlogs): + task_ids = sorted( + [task_id for task_id in os.listdir(runlogs) if task_id not in EXCLUDE_DIR_NAMES], + key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)), + ) + else: + raise ValueError("please input a valid directory to tabulate result") + + trials = sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x)) if len(task_ids) > 0 else [] + dbnames = [[f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids] for trial in trials] + + query = """ + SELECT cost, session_id, response, start_time, end_time + FROM ( + SELECT invocation_id, cost, session_id, response, start_time, end_time, + ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) as rn + FROM chat_completions + ) + WHERE rn = 1; + """ + + with pd.ExcelWriter(excel_path, engine="openpyxl") as writer: + for trial_index, each_trial in enumerate(dbnames): + result_df = pd.DataFrame( + columns=[ + "id", + "status", + "expected_answer", + "final_answer", + "cost", + "latency", + "num_of_llm_requests", + "num_of_chat_messages", + "prompt_tokens", + "completion_tokens", + "total_tokens", + "model", + ] + ) + + result_df_type_mapping = { + "id": str, + "status": bool, + "expected_answer": str, + "final_answer": str, + "cost": float, + "latency": float, + "num_of_llm_requests": int, + "num_of_chat_messages": int, + "prompt_tokens": int, + "completion_tokens": int, + "total_tokens": int, + } + + for dbname, scorer_results in zip(each_trial, all_results): + task_id = scorer_results[0] + scorer_result = scorer_results[trial_index + 1] + + status, expected_answer, final_answer = scorer_result if scorer_result else (False,"","") + + con = sqlite3.connect(dbname) + + # TODO: if large amount of data, add chunksize + telemetry_df = pd.read_sql_query(query, con) + + earliest_starttime = pd.to_datetime(telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f").min() + latest_endtime = pd.to_datetime(telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f").max() + + num_of_chat_messages = get_number_of_chat_messages(chat_messages_dir=os.path.dirname(dbname)) + result = { + "id": task_id, + "status": status, + "expected_answer": expected_answer, + "final_answer": final_answer, + "cost": telemetry_df["cost"].sum(), + "latency": (latest_endtime - earliest_starttime).total_seconds(), + "num_of_llm_requests": len(telemetry_df), + "num_of_chat_messages": num_of_chat_messages, + "prompt_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["prompt_tokens"] + if "usage" in json.loads(x) and "prompt_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "completion_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["completion_tokens"] + if "usage" in json.loads(x) and "completion_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "total_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["total_tokens"] + if "usage" in json.loads(x) and "total_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "model": telemetry_df["response"] + .apply(lambda x: json.loads(x)["model"] if "model" in json.loads(x) else "") + .unique(), + } + + result_df = result_df.astype(result_df_type_mapping) + result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True) + result_df.to_excel(writer, sheet_name=f"trial_{trial_index}", index=False) + + +if __name__ == "__main__" and __package__ is None: + main(sys.argv) diff --git a/python/benchmarks/GAIA/Scripts/init_tasks.py b/python/benchmarks/GAIA/Scripts/init_tasks.py new file mode 100644 index 000000000..3db67a37f --- /dev/null +++ b/python/benchmarks/GAIA/Scripts/init_tasks.py @@ -0,0 +1,157 @@ +# +# Run this file to download the human_eval dataset, and create a corresponding testbed scenario: +# (default: ../scenarios/human_eval_two_agents_gpt4.jsonl and ./scenarios/human_eval_two_agents_gpt35.jsonl) +# + +import json +import os +import re +import sys + +from huggingface_hub import snapshot_download + +SCRIPT_PATH = os.path.realpath(__file__) +SCRIPT_NAME = os.path.basename(SCRIPT_PATH) +SCRIPT_DIR = os.path.dirname(SCRIPT_PATH) + +SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir)) +TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates") +TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks") +DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads") +REPO_DIR = os.path.join(DOWNLOADS_DIR, "GAIA") + + +def download_gaia(): + """Download the GAIA benchmark from Hugging Face.""" + + if not os.path.isdir(DOWNLOADS_DIR): + os.mkdir(DOWNLOADS_DIR) + + """Download the GAIA dataset from Hugging Face Hub""" + snapshot_download( + repo_id="gaia-benchmark/GAIA", + repo_type="dataset", + local_dir=REPO_DIR, + local_dir_use_symlinks=True, + ) + + +def create_jsonl(name, tasks, files_dir, template): + """Creates a JSONL scenario file with a given name, and template path.""" + + if not os.path.isdir(TASKS_DIR): + os.mkdir(TASKS_DIR) + + with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh: + for task in tasks: + print(f"Converting: [{name}] {task['task_id']}") + + # Figure out what files we need to copy + template_cp_list = [template] + if len(task["file_name"].strip()) > 0: + template_cp_list.append( + [ + os.path.join(files_dir, task["file_name"].strip()), + task["file_name"].strip(), + #os.path.join("coding", task["file_name"].strip()), + ] + ) + + record = { + "id": task["task_id"], + "template": template_cp_list, + "substitutions": { + "scenario.py": { + "__FILE_NAME__": task["file_name"], + }, + "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]}, + "prompt.txt": {"__PROMPT__": task["Question"]}, + }, + } + + fh.write(json.dumps(record).strip() + "\n") + + +############################################################################### +def main(): + download_gaia() + + gaia_validation_files = os.path.join(REPO_DIR, "2023", "validation") + gaia_test_files = os.path.join(REPO_DIR, "2023", "test") + + if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files): + sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the GAIA repository.") + + # Load the GAIA data + gaia_validation_tasks = [[], [], []] + with open(os.path.join(gaia_validation_files, "metadata.jsonl")) as fh: + for line in fh: + data = json.loads(line) + gaia_validation_tasks[data["Level"] - 1].append(data) + + gaia_test_tasks = [[], [], []] + with open(os.path.join(gaia_test_files, "metadata.jsonl")) as fh: + for line in fh: + data = json.loads(line) + + # A welcome message -- not a real task + if data["task_id"] == "0-0-0-0-0": + continue + + gaia_test_tasks[data["Level"] - 1].append(data) + + # list all directories in the Templates directory + # and populate a dictionary with the name and path + templates = {} + for entry in os.scandir(TEMPLATES_DIR): + if entry.is_dir(): + templates[re.sub(r"\s", "", entry.name)] = entry.path + + # Add coding directories if needed (these are usually empty and left out of the repo) + #for template in templates.values(): + # code_dir_path = os.path.join(template, "coding") + # if not os.path.isdir(code_dir_path): + # os.mkdir(code_dir_path) + + # Create the various combinations of [models] x [templates] + for t in templates.items(): + create_jsonl( + f"gaia_validation_level_1__{t[0]}", + gaia_validation_tasks[0], + gaia_validation_files, + t[1], + ) + create_jsonl( + f"gaia_validation_level_2__{t[0]}", + gaia_validation_tasks[1], + gaia_validation_files, + t[1], + ) + create_jsonl( + f"gaia_validation_level_3__{t[0]}", + gaia_validation_tasks[2], + gaia_validation_files, + t[1], + ) + create_jsonl( + f"gaia_test_level_1__{t[0]}", + gaia_test_tasks[0], + gaia_test_files, + t[1], + ) + create_jsonl( + f"gaia_test_level_2__{t[0]}", + gaia_test_tasks[1], + gaia_test_files, + t[1], + ) + create_jsonl( + f"gaia_test_level_3__{t[0]}", + gaia_test_tasks[2], + gaia_test_files, + t[1], + ) + + +if __name__ == "__main__" and __package__ is None: + main() diff --git a/python/benchmarks/GAIA/Templates/TeamOne/expected_answer.txt b/python/benchmarks/GAIA/Templates/TeamOne/expected_answer.txt new file mode 100644 index 000000000..8153c2bf8 --- /dev/null +++ b/python/benchmarks/GAIA/Templates/TeamOne/expected_answer.txt @@ -0,0 +1 @@ +__EXPECTED_ANSWER__ diff --git a/python/benchmarks/GAIA/Templates/TeamOne/prompt.txt b/python/benchmarks/GAIA/Templates/TeamOne/prompt.txt new file mode 100644 index 000000000..482f50dca --- /dev/null +++ b/python/benchmarks/GAIA/Templates/TeamOne/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/python/benchmarks/GAIA/Templates/TeamOne/requirements.txt b/python/benchmarks/GAIA/Templates/TeamOne/requirements.txt new file mode 100644 index 000000000..ffb79dd43 --- /dev/null +++ b/python/benchmarks/GAIA/Templates/TeamOne/requirements.txt @@ -0,0 +1 @@ +/agnext/teams/team-one diff --git a/python/benchmarks/GAIA/Templates/TeamOne/scenario.py b/python/benchmarks/GAIA/Templates/TeamOne/scenario.py new file mode 100644 index 000000000..507323d87 --- /dev/null +++ b/python/benchmarks/GAIA/Templates/TeamOne/scenario.py @@ -0,0 +1,197 @@ +import asyncio +import logging +import json +import os + +from typing import Any, Dict, List, Tuple, Union +from agnext.application import SingleThreadedAgentRuntime +from agnext.application.logging import EVENT_LOGGER_NAME +from agnext.components.models import ( + AzureOpenAIChatCompletionClient, + ChatCompletionClient, + ModelCapabilities, + UserMessage, + LLMMessage, +) +from agnext.components.code_executor import LocalCommandLineCodeExecutor +from agnext.application.logging import EVENT_LOGGER_NAME +from team_one.markdown_browser import MarkdownConverter, UnsupportedFormatException +from team_one.agents.coder import Coder, Executor +from team_one.agents.orchestrator import LedgerOrchestrator +from team_one.messages import BroadcastMessage, OrchestrationEvent, RequestReplyMessage +from team_one.agents.multimodal_web_surfer import MultimodalWebSurfer +from team_one.agents.file_surfer import FileSurfer +from team_one.utils import LogHandler, message_content_to_str, create_completion_client_from_env + + +async def response_preparer(task: str, source: str, client: ChatCompletionClient, transcript: List[LLMMessage]): + messages: List[LLMMessage] = [ + UserMessage( + content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. Here is a transcript of that conversation:", + source=source, + ) + ] + + # copy them to this context + for message in transcript: + messages.append( + UserMessage( + content = message_content_to_str(message.content), + source=message.source, + ) + ) + + # ask for the final answer + messages.append( + UserMessage( + content= f""" +Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience: + +{task} + +To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER] +Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. +""", +#If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine' + source=source, + ) + ) + + + response = await client.create(messages) + assert isinstance(response.content, str) + + # No answer + if "unable to determine" in response.content.lower(): + messages.append( AssistantMessage(content=response.content, source="self" ) ) + messages.append( + UserMessage( + content= f""" +I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation. + +To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS] +Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc. +ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. +""".strip(), + source=source, + ) + ) + + response = await client.create(messages) + assert isinstance(response.content, str) + return re.sub(r"EDUCATED GUESS:", "FINAL ANSWER:", response.content) + + else: + return response.content + + +async def main() -> None: + # Read the prompt + prompt = "" + with open("prompt.txt", "rt") as fh: + prompt = fh.read().strip() + filename = "__FILE_NAME__".strip() + + # Create the runtime. + runtime = SingleThreadedAgentRuntime() + + # Create the AzureOpenAI client, with AAD auth + # token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") + client = AzureOpenAIChatCompletionClient( + api_version="2024-02-15-preview", + azure_endpoint="https://aif-complex-tasks-west-us-3.openai.azure.com/", + model="gpt-4o-2024-05-13", + model_capabilities=ModelCapabilities( + function_calling=True, json_output=True, vision=True + ), + # azure_ad_token_provider=token_provider + ) + + # Register agents. + coder = runtime.register_and_get_proxy( + "Coder", + lambda: Coder(model_client=client), + ) + + executor = runtime.register_and_get_proxy( + "Executor", + lambda: Executor( + "A agent for executing code", executor=LocalCommandLineCodeExecutor() + ), + ) + + file_surfer = runtime.register_and_get_proxy( + "file_surfer", + lambda: FileSurfer(model_client=client), + ) + + web_surfer = runtime.register_and_get_proxy( + "WebSurfer", + lambda: MultimodalWebSurfer(), # Configuration is set later by init() + ) + + orchestrator = runtime.register_and_get_proxy("orchestrator", lambda: LedgerOrchestrator( + agents=[coder, executor, file_surfer, web_surfer], + model_client=client, + )) + + run_context = runtime.start() + + actual_surfer = runtime._get_agent(web_surfer.id) # type: ignore + assert isinstance(actual_surfer, MultimodalWebSurfer) + await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium") + + #await runtime.send_message(RequestReplyMessage(), user_proxy.id) + + filename_prompt = "" + if len(filename) > 0: + #relpath = os.path.join("coding", filename) + #file_uri = pathlib.Path(os.path.abspath(os.path.expanduser(relpath))).as_uri() + + filename_prompt = f"The question is about a file, document or image, which can be accessed by the filename '{filename}' in the current working directory." + + try: + mdconverter = MarkdownConverter() + res = mdconverter.convert(filename) + if res.text_content: + #if count_token(res.text_content) < 8000: # Don't put overly-large documents into the prompt + filename_prompt += "\n\nHere are the file's contents:\n\n" + res.text_content + except UnsupportedFormatException: + pass + + #mdconverter = MarkdownConverter(mlm_client=client) + #mlm_prompt = f"""Write a detailed caption for this image. Pay special attention to any details that might be useful for someone answering the following: + +#{PROMPT} +#""".strip() + + task = f"{prompt}\n\n{filename_prompt}" + + await runtime.publish_message( + BroadcastMessage(content=UserMessage(content=task.strip(), source="human")), + namespace="default", + ) + + await run_context.stop_when_idle() + + # Output the final answer + actual_orchestrator = runtime._get_agent(orchestrator.id) # type: ignore + assert isinstance(actual_orchestrator, LedgerOrchestrator) + transcript: List[LLMMessage] = actual_orchestrator._chat_history # type: ignore + print(await response_preparer(task=task, source=orchestrator.metadata["name"], client=client, transcript=transcript)) + + + +if __name__ == "__main__": + logger = logging.getLogger(EVENT_LOGGER_NAME) + logger.setLevel(logging.INFO) + log_handler = LogHandler() + logger.handlers = [log_handler] + asyncio.run(main()) diff --git a/python/teams/team-one/examples/example_websurfer.py b/python/teams/team-one/examples/example_websurfer.py index e409e23ef..ff137a421 100644 --- a/python/teams/team-one/examples/example_websurfer.py +++ b/python/teams/team-one/examples/example_websurfer.py @@ -1,5 +1,6 @@ import asyncio import logging +import os from agnext.application import SingleThreadedAgentRuntime from agnext.application.logging import EVENT_LOGGER_NAME @@ -36,7 +37,7 @@ async def main() -> None: actual_surfer = runtime._get_agent(web_surfer.id) # type: ignore assert isinstance(actual_surfer, MultimodalWebSurfer) - await actual_surfer.init(model_client=client, browser_channel="chromium") + await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium") await runtime.send_message(RequestReplyMessage(), user_proxy.id) await run_context.stop_when_idle() diff --git a/python/teams/team-one/pyproject.toml b/python/teams/team-one/pyproject.toml index 4623a33b5..21a6b0bf3 100644 --- a/python/teams/team-one/pyproject.toml +++ b/python/teams/team-one/pyproject.toml @@ -23,9 +23,9 @@ dependencies = [ "markdownify", "numpy", "python-pptx", + "easyocr", "pandas", "pdfminer.six", - "easyocr", "puremagic", "binaryornot", "pydub", diff --git a/python/teams/team-one/src/team_one/agents/base_agent.py b/python/teams/team-one/src/team_one/agents/base_agent.py index 57e9cf448..0368b56bb 100644 --- a/python/teams/team-one/src/team_one/agents/base_agent.py +++ b/python/teams/team-one/src/team_one/agents/base_agent.py @@ -8,7 +8,7 @@ from agnext.components.models import ( ) from agnext.core import CancellationToken -from team_one.messages import BroadcastMessage, RequestReplyMessage, UserContent +from team_one.messages import BroadcastMessage, RequestReplyMessage, ResetMessage, UserContent from team_one.utils import message_content_to_str @@ -28,6 +28,11 @@ class BaseAgent(TypeRoutedAgent): assert isinstance(message.content, UserMessage) self._chat_history.append(message.content) + @message_handler + async def handle_reset(self, message: ResetMessage, cancellation_token: CancellationToken) -> None: + """Handle a reset message.""" + await self._reset(cancellation_token) + @message_handler async def handle_request_reply(self, message: RequestReplyMessage, cancellation_token: CancellationToken) -> None: """Respond to a reply request.""" @@ -42,3 +47,6 @@ class BaseAgent(TypeRoutedAgent): async def _generate_reply(self, cancellation_token: CancellationToken) -> Tuple[bool, UserContent]: """Returns (request_halt, response_message)""" raise NotImplementedError() + + async def _reset(self, cancellation_token: CancellationToken) -> None: + self._chat_history = [] diff --git a/python/teams/team-one/src/team_one/agents/coder.py b/python/teams/team-one/src/team_one/agents/coder.py index 824e627b2..0df818bc7 100644 --- a/python/teams/team-one/src/team_one/agents/coder.py +++ b/python/teams/team-one/src/team_one/agents/coder.py @@ -20,7 +20,7 @@ class Coder(BaseAgent): DEFAULT_DESCRIPTION = "A Python coder assistant." DEFAULT_SYSTEM_MESSAGES = [ - SystemMessage("""You are a helpful AI assistant. Solve tasks using your Python coding skills. The code you output must be formatted in Markdown code blocks demarcated by triple backticks (```). As an example: + SystemMessage("""You are a helpful AI assistant. Solve tasks using your Python coding skills. The code you output must be formatted in Markdown code blocks demarcated by triple backticks (```), and must print their final output to console. As an example: ```python @@ -86,10 +86,18 @@ class Executor(BaseAgent): ) cancellation_token.link_future(future) result = await future - return ( - False, - f"The script ran, then exited with Unix exit code: {result.exit_code}\nIts output was:\n{result.output}", - ) + + if result.output.strip() == "": + # Sometimes agents forget to print(). Remind the to print something + return ( + False, + f"The script ran but produced no output to console. The Unix exit code was: {result.exit_code}. If you were expecting output, consider revising the script to ensure content is printed to stdout.", + ) + else: + return ( + False, + f"The script ran, then exited with Unix exit code: {result.exit_code}\nIts output was:\n{result.output}", + ) else: n_messages_checked += 1 if n_messages_checked > self._check_last_n_message: diff --git a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py index 6391f3b02..5d40180f3 100644 --- a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py +++ b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/multimodal_web_surfer.py @@ -2,6 +2,7 @@ import base64 import hashlib import io import json +import logging import os import pathlib import re @@ -10,6 +11,7 @@ from typing import Any, BinaryIO, Dict, List, Tuple, Union, cast # Any, Callabl from urllib.parse import quote_plus # parse_qs, quote, unquote, urlparse, urlunparse import aiofiles +from agnext.application.logging import EVENT_LOGGER_NAME from agnext.components import Image as AGImage from agnext.components.models import ( AssistantMessage, @@ -24,10 +26,10 @@ from playwright._impl._errors import Error as PlaywrightError from playwright._impl._errors import TimeoutError # from playwright._impl._async_base.AsyncEventInfo -from playwright.async_api import BrowserContext, Page, Playwright, async_playwright +from playwright.async_api import BrowserContext, Download, Page, Playwright, async_playwright from team_one.agents.base_agent import BaseAgent -from team_one.messages import UserContent +from team_one.messages import UserContent, WebSurferEvent from team_one.utils import SentinelMeta, message_content_to_str # TODO: Fix mdconvert @@ -68,6 +70,8 @@ VIEWPORT_WIDTH = 1440 MLM_HEIGHT = 765 MLM_WIDTH = 1224 +logger = logging.getLogger(EVENT_LOGGER_NAME + ".MultimodalWebSurfer") + # Sentinels class DEFAULT_CHANNEL(metaclass=SentinelMeta): @@ -92,6 +96,7 @@ class MultimodalWebSurfer(BaseAgent): self._playwright: Playwright | None = None self._context: BrowserContext | None = None self._page: Page | None = None + self._last_download: Download | None = None self._prior_metadata_hash: str | None = None # Read page_script @@ -99,6 +104,12 @@ class MultimodalWebSurfer(BaseAgent): with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh: self._page_script = fh.read() + # Define the download handler + def _download_handler(download: Download) -> None: + self._last_download = download + + self._download_handler = _download_handler + async def init( self, model_client: ChatCompletionClient, @@ -115,12 +126,7 @@ class MultimodalWebSurfer(BaseAgent): self.start_page = start_page or self.DEFAULT_START_PAGE self.downloads_folder = downloads_folder self._chat_history: List[LLMMessage] = [] - - # def _download_handler(download): - # self._last_download = download - # - # self._download_handler = _download_handler - # self._last_download = None + self._last_download = None self._prior_metadata_hash = None ## Create or use the provided MarkdownConverter @@ -148,14 +154,13 @@ class MultimodalWebSurfer(BaseAgent): self._context.set_default_timeout(60000) # One minute self._page = await self._context.new_page() # self._page.route(lambda x: True, self._route_handler) - # self._page.on("download", self._download_handler) + self._page.on("download", self._download_handler) await self._page.set_viewport_size({"width": VIEWPORT_WIDTH, "height": VIEWPORT_HEIGHT}) await self._page.add_init_script( path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js") ) await self._page.goto(self.start_page) await self._page.wait_for_load_state() - # self._sleep(1) # Prepare the debug directory -- which stores the screenshots generated throughout the process await self._set_debug_dir(debug_dir) @@ -194,15 +199,21 @@ setInterval(function() {{ await self._page.screenshot(path=os.path.join(self.debug_dir, "screenshot.png")) print(f"Multimodal Web Surfer debug screens: {pathlib.Path(os.path.abspath(debug_html)).as_uri()}\n") - # def reset(self): - # super().reset() - # self._log_to_console(fname="reset", args={"home": self.start_page}) - # self._visit_page(self.start_page) - # self._page.wait_for_load_state() - # if self.debug_dir: - # screenshot = self._page.screenshot() - # with open(os.path.join(self.debug_dir, "screenshot.png"), "wb") as png: - # png.write(screenshot) + async def _reset(self, cancellation_token: CancellationToken) -> None: + assert self._page is not None + future = super()._reset(cancellation_token) + await future + await self._visit_page(self.start_page) + await self._page.wait_for_load_state() + if self.debug_dir: + await self._page.screenshot(path=os.path.join(self.debug_dir, "screenshot.png")) + logger.info( + WebSurferEvent( + source=self.metadata["name"], + url=self._page.url, + message="Resetting browser.", + ) + ) def _target_name(self, target: str, rects: Dict[str, InteractiveRegion]) -> str | None: try: @@ -358,122 +369,131 @@ When deciding between tools, consider if the request can be best addressed by: message = response.content action_description = "" - # self._last_download = None - # try: - if True: - if isinstance(message, str): - # Answer directly - return False, message + self._last_download = None - elif isinstance(message, list): - # Take an action + if isinstance(message, str): + # Answer directly + return False, message - name = message[0].name - args = json.loads(message[0].arguments) + elif isinstance(message, list): + # Take an action - if name == "visit_url": - url = args.get("url") - action_description = f"I typed '{url}' into the browser address bar." - # Check if the argument starts with a known protocol - if url.startswith(("https://", "http://", "file://", "about:")): - await self._visit_page(url) - # If the argument contains a space, treat it as a search query - elif " " in url: - await self._visit_page(f"https://www.bing.com/search?q={quote_plus(url)}&FORM=QBLH") - # Otherwise, prefix with https:// - else: - await self._visit_page("https://" + url) + name = message[0].name + args = json.loads(message[0].arguments) - elif name == "history_back": - action_description = "I clicked the browser back button." - await self._back() - - elif name == "web_search": - query = args.get("query") - action_description = f"I typed '{query}' into the browser search bar." - await self._visit_page(f"https://www.bing.com/search?q={quote_plus(query)}&FORM=QBLH") - - elif name == "page_up": - action_description = "I scrolled up one page in the browser." - await self._page_up() - - elif name == "page_down": - action_description = "I scrolled down one page in the browser." - await self._page_down() - - elif name == "click": - target_id = str(args.get("target_id")) - target_name = self._target_name(target_id, rects) - if target_name: - action_description = f"I clicked '{target_name}'." - else: - action_description = "I clicked the control." - await self._click_id(target_id) - - elif name == "input_text": - input_field_id = str(args.get("input_field_id")) - text_value = str(args.get("text_value")) - input_field_name = self._target_name(input_field_id, rects) - if input_field_name: - action_description = f"I typed '{text_value}' into '{input_field_name}'." - else: - action_description = f"I input '{text_value}'." - await self._fill_id(input_field_id, text_value) - - elif name == "scroll_element_up": - target_id = str(args.get("target_id")) - target_name = self._target_name(target_id, rects) - - if target_name: - action_description = f"I scrolled '{target_name}' up." - else: - action_description = "I scrolled the control up." - - await self._scroll_id(target_id, "up") - - elif name == "scroll_element_down": - target_id = str(args.get("target_id")) - target_name = self._target_name(target_id, rects) - - if target_name: - action_description = f"I scrolled '{target_name}' down." - else: - action_description = "I scrolled the control down." - - await self._scroll_id(target_id, "down") - - elif name == "answer_question": - question = str(args.get("question")) - # Do Q&A on the DOM. No need to take further action. Browser state does not change. - return False, await self._summarize_page(question=question) - - elif name == "summarize_page": - # Summarize the DOM. No need to take further action. Browser state does not change. - return False, await self._summarize_page() - - elif name == "sleep": - action_description = "I am waiting a short period of time before taking further action." - await self._sleep(3) # There's a 2s sleep below too + logger.info( + WebSurferEvent( + source=self.metadata["name"], + url=self._page.url, + action=name, + arguments=args, + message=f"{name}( {json.dumps(args)} )", + ) + ) + if name == "visit_url": + url = args.get("url") + action_description = f"I typed '{url}' into the browser address bar." + # Check if the argument starts with a known protocol + if url.startswith(("https://", "http://", "file://", "about:")): + await self._visit_page(url) + # If the argument contains a space, treat it as a search query + elif " " in url: + await self._visit_page(f"https://www.bing.com/search?q={quote_plus(url)}&FORM=QBLH") + # Otherwise, prefix with https:// else: - raise ValueError(f"Unknown tool '{name}'. Please choose from:\n\n{tool_names}") - else: - # Not sure what happened here - raise AssertionError(f"Unknown response format '{message}'") + await self._visit_page("https://" + url) - # except ValueError as e: - # return True, str(e) + elif name == "history_back": + action_description = "I clicked the browser back button." + await self._back() + + elif name == "web_search": + query = args.get("query") + action_description = f"I typed '{query}' into the browser search bar." + await self._visit_page(f"https://www.bing.com/search?q={quote_plus(query)}&FORM=QBLH") + + elif name == "page_up": + action_description = "I scrolled up one page in the browser." + await self._page_up() + + elif name == "page_down": + action_description = "I scrolled down one page in the browser." + await self._page_down() + + elif name == "click": + target_id = str(args.get("target_id")) + target_name = self._target_name(target_id, rects) + if target_name: + action_description = f"I clicked '{target_name}'." + else: + action_description = "I clicked the control." + await self._click_id(target_id) + + elif name == "input_text": + input_field_id = str(args.get("input_field_id")) + text_value = str(args.get("text_value")) + input_field_name = self._target_name(input_field_id, rects) + if input_field_name: + action_description = f"I typed '{text_value}' into '{input_field_name}'." + else: + action_description = f"I input '{text_value}'." + await self._fill_id(input_field_id, text_value) + + elif name == "scroll_element_up": + target_id = str(args.get("target_id")) + target_name = self._target_name(target_id, rects) + + if target_name: + action_description = f"I scrolled '{target_name}' up." + else: + action_description = "I scrolled the control up." + + await self._scroll_id(target_id, "up") + + elif name == "scroll_element_down": + target_id = str(args.get("target_id")) + target_name = self._target_name(target_id, rects) + + if target_name: + action_description = f"I scrolled '{target_name}' down." + else: + action_description = "I scrolled the control down." + + await self._scroll_id(target_id, "down") + + elif name == "answer_question": + question = str(args.get("question")) + # Do Q&A on the DOM. No need to take further action. Browser state does not change. + return False, await self._summarize_page(question=question) + + elif name == "summarize_page": + # Summarize the DOM. No need to take further action. Browser state does not change. + return False, await self._summarize_page() + + elif name == "sleep": + action_description = "I am waiting a short period of time before taking further action." + await self._sleep(3) # There's a 2s sleep below too + + else: + raise ValueError(f"Unknown tool '{name}'. Please choose from:\n\n{tool_names}") + else: + # Not sure what happened here + raise AssertionError(f"Unknown response format '{message}'") await self._page.wait_for_load_state() await self._sleep(3) - # # Handle downloads - # if self._last_download is not None and self.downloads_folder is not None: - # fname = os.path.join(self.downloads_folder, self._last_download.suggested_filename) - # self._last_download.save_as(fname) - # page_body = f"Download Successful

Successfully downloaded '{self._last_download.suggested_filename}' to local path:

{fname}

" - # self._page.goto("data:text/html;base64," + base64.b64encode(page_body.encode("utf-8")).decode("utf-8")) - # self._page.wait_for_load_state() + # Handle downloads + if self._last_download is not None and self.downloads_folder is not None: + fname = os.path.join(self.downloads_folder, self._last_download.suggested_filename) + # TODO: Fix this type + await self._last_download.save_as(fname) # type: ignore + page_body = f"Download Successful

Successfully downloaded '{self._last_download.suggested_filename}' to local path:

{fname}

" + await self._page.goto( + "data:text/html;base64," + base64.b64encode(page_body.encode("utf-8")).decode("utf-8") + ) + await self._page.wait_for_load_state() # Handle metadata page_metadata = json.dumps(await self._get_page_metadata(), indent=4) @@ -571,7 +591,7 @@ When deciding between tools, consider if the request can be best addressed by: async def _on_new_page(self, page: Page) -> None: self._page = page # self._page.route(lambda x: True, self._route_handler) - # self._page.on("download", self._download_handler) + self._page.on("download", self._download_handler) await self._page.set_viewport_size({"width": VIEWPORT_WIDTH, "height": VIEWPORT_HEIGHT}) await self._sleep(0.2) self._prior_metadata_hash = None @@ -644,6 +664,15 @@ When deciding between tools, consider if the request can be best addressed by: assert isinstance(new_page, Page) await self._on_new_page(new_page) + + logger.info( + WebSurferEvent( + source=self.metadata["name"], + url=self._page.url, + message="New tab or window.", + ) + ) + except TimeoutError: pass diff --git a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/tool_definitions.py b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/tool_definitions.py index 7f5fe71fa..e23161665 100644 --- a/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/tool_definitions.py +++ b/python/teams/team-one/src/team_one/agents/multimodal_web_surfer/tool_definitions.py @@ -21,7 +21,7 @@ TOOL_VISIT_URL: ToolSchema = _load_tool( "type": "function", "function": { "name": "visit_url", - "description": "Inputs the given url into the browser's address bar, navigating directly to the requested page.", + "description": "Navigate directly to a provided URL using the browser's address bar. Prefer this tool over other navigation techniques in cases where the user provides a fully-qualified URL (e.g., choose it over clicking links, or inputing queries into search boxes).", "parameters": { "type": "object", "properties": { diff --git a/python/teams/team-one/src/team_one/agents/orchestrator.py b/python/teams/team-one/src/team_one/agents/orchestrator.py index bf32aa1cd..ef77aaecb 100644 --- a/python/teams/team-one/src/team_one/agents/orchestrator.py +++ b/python/teams/team-one/src/team_one/agents/orchestrator.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional from agnext.components.models import AssistantMessage, ChatCompletionClient, LLMMessage, SystemMessage, UserMessage from agnext.core import AgentProxy -from ..messages import BroadcastMessage, OrchestrationEvent +from ..messages import BroadcastMessage, OrchestrationEvent, ResetMessage from .base_orchestrator import BaseOrchestrator, logger from .orchestrator_prompts import ( ORCHESTRATOR_CLOSED_BOOK_PROMPT, @@ -186,6 +186,10 @@ class LedgerOrchestrator(BaseOrchestrator): f"New plan:\n{plan_str}", ) ) + + # Reset + self._chat_history = [self._chat_history[0]] + await self.publish_message(ResetMessage()) self._chat_history.append(plan_user_message) ledger_dict = await self.update_ledger() diff --git a/python/teams/team-one/src/team_one/messages.py b/python/teams/team-one/src/team_one/messages.py index 597da5a9c..0623760a2 100644 --- a/python/teams/team-one/src/team_one/messages.py +++ b/python/teams/team-one/src/team_one/messages.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Union +from typing import Any, Dict, List, Union from agnext.components import FunctionCall, Image from agnext.components.models import FunctionExecutionResult, LLMMessage @@ -22,7 +22,21 @@ class RequestReplyMessage: pass +@dataclass +class ResetMessage: + pass + + @dataclass class OrchestrationEvent: source: str message: str + + +@dataclass +class WebSurferEvent: + source: str + message: str + url: str + action: str | None = None + arguments: Dict[str, Any] | None = None diff --git a/python/teams/team-one/src/team_one/utils.py b/python/teams/team-one/src/team_one/utils.py index ecabbd4ed..c86660422 100644 --- a/python/teams/team-one/src/team_one/utils.py +++ b/python/teams/team-one/src/team_one/utils.py @@ -1,10 +1,12 @@ import json import logging import os +from dataclasses import asdict from datetime import datetime from typing import Any, Dict, List, Literal from agnext.application.logging.events import LLMCallEvent +from agnext.components import Image from agnext.components.models import ( AzureOpenAIChatCompletionClient, ChatCompletionClient, @@ -12,7 +14,14 @@ from agnext.components.models import ( OpenAIChatCompletionClient, ) -from .messages import AssistantContent, FunctionExecutionContent, OrchestrationEvent, SystemContent, UserContent +from .messages import ( + AssistantContent, + FunctionExecutionContent, + OrchestrationEvent, + SystemContent, + UserContent, + WebSurferEvent, +) ENVIRON_KEY_CHAT_COMPLETION_PROVIDER = "CHAT_COMPLETION_PROVIDER" ENVIRON_KEY_CHAT_COMPLETION_KWARGS_JSON = "CHAT_COMPLETION_KWARGS_JSON" @@ -82,6 +91,8 @@ def message_content_to_str( for item in message_content: if isinstance(item, str): converted.append(item.rstrip()) + elif isinstance(item, Image): + converted.append("") else: converted.append(str(item).rstrip()) return "\n".join(converted) @@ -95,7 +106,8 @@ class LogHandler(logging.FileHandler): super().__init__(filename) def emit(self, record: logging.LogRecord) -> None: - try: + # try: + if True: ts = datetime.fromtimestamp(record.created).isoformat() if isinstance(record.msg, OrchestrationEvent): console_message = ( @@ -111,6 +123,16 @@ class LogHandler(logging.FileHandler): } ) super().emit(record) + if isinstance(record.msg, WebSurferEvent): + console_message = f"\033[96m[{ts}], {record.msg.source}: {record.msg.message}\033[0m" + print(console_message, flush=True) + payload: Dict[str, Any] = { + "timestamp": ts, + "type": "WebSurferEvent", + } + payload.update(asdict(record.msg)) + record.msg = json.dumps(payload) + super().emit(record) if isinstance(record.msg, LLMCallEvent): record.msg = json.dumps( { @@ -120,9 +142,9 @@ class LogHandler(logging.FileHandler): "type": "LLMCallEvent", } ) - super().emit(record) - except Exception: - self.handleError(record) + super().emit(record) + # except Exception: + # self.handleError(record) class SentinelMeta(type): diff --git a/python/tools/agbench/src/agbench/res/Dockerfile b/python/tools/agbench/src/agbench/res/Dockerfile index 4f4286a2e..d0700b511 100644 --- a/python/tools/agbench/src/agbench/res/Dockerfile +++ b/python/tools/agbench/src/agbench/res/Dockerfile @@ -18,8 +18,7 @@ RUN pip install openai pillow aiohttp typing-extensions pydantic types-aiofiles RUN pip install numpy pandas matplotlib seaborn scikit-learn requests urllib3 nltk pytest # Pre-load packages needed for mdconvert file utils -RUN pip install python-docx pdfminer.six python-pptx SpeechRecognition openpyxl pydub mammoth puremagic youtube_transcript_api==0.6.0 -# easyocr +RUN pip install python-docx pdfminer.six python-pptx SpeechRecognition openpyxl pydub mammoth puremagic youtube_transcript_api==0.6.0 easyocr # Pre-load Playwright RUN pip install playwright @@ -30,7 +29,7 @@ RUN pip uninstall --yes numpy RUN pip install "numpy<2.0" # Pre-load the OCR model -#RUN /usr/bin/echo -e "import easyocr\nreader = easyocr.Reader(['en'])" | python +RUN /usr/bin/echo -e "import easyocr\nreader = easyocr.Reader(['en'])" | python # Webarena (evaluation code) RUN pip install beartype aiolimiter