TeamOne implementation of GAIA (#221)

Port of GAIA benchmark
This commit is contained in:
afourney
2024-07-17 09:51:19 -07:00
committed by GitHub
parent e69dd92c4f
commit 211bfa01c3
17 changed files with 790 additions and 144 deletions

View File

@@ -0,0 +1,7 @@
{
    "CHAT_COMPLETION_PROVIDER": "azure",
    "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-05-01-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT_HERE\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}",
    "BING_API_KEY": "YOUR_KEY_HERE",
    "HOMEPAGE": "https://www.bing.com/",
    "WEB_SURFER_DEBUG_DIR": "/autogen/debug"
}
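
These values are surfaced to the scenario as environment variables; scenario.py (later in this change) imports create_completion_client_from_env from team_one.utils to consume them. A minimal sketch of the assumed behavior follows — it is not the actual team_one.utils implementation, and the real function also resolves azure_ad_token_provider="DEFAULT" to an AAD credential, which is omitted here:

import json
import os

from agnext.components.models import AzureOpenAIChatCompletionClient, ModelCapabilities


def create_client_from_env_sketch():
    # Assumed: CHAT_COMPLETION_KWARGS_JSON holds a JSON-encoded kwargs dict (see above).
    kwargs = json.loads(os.environ.get("CHAT_COMPLETION_KWARGS_JSON", "{}"))
    if isinstance(kwargs.get("model_capabilities"), dict):
        kwargs["model_capabilities"] = ModelCapabilities(**kwargs["model_capabilities"])
    # Resolving azure_ad_token_provider="DEFAULT" to a real token provider is omitted.
    kwargs.pop("azure_ad_token_provider", None)
    if os.environ.get("CHAT_COMPLETION_PROVIDER", "azure") == "azure":
        return AzureOpenAIChatCompletionClient(**kwargs)
    raise ValueError("Unsupported CHAT_COMPLETION_PROVIDER")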

View File

@@ -0,0 +1,197 @@
import glob
import json
import os
import re
import sqlite3
import sys

import numpy as np
import pandas as pd
from agbench.tabulate_cmd import default_tabulate

EXCLUDE_DIR_NAMES = ["__pycache__"]


def normalize_answer(a):
    # Lower case, trim, standardize comma-separated values,
    # collapse repeated whitespace, and remove trailing punctuation
    norm_answer = ", ".join(a.strip().lower().split(","))
    norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer))
    return norm_answer
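
# Illustrative behavior (inputs made up for clarity):
#   normalize_answer("  Paris,France! ")  ->  "paris, france"
#   normalize_answer("3.5  million.")     ->  "3.5 million"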


def scorer(instance_dir):
    # Read the expected answer
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    if not os.path.isfile(expected_answer_file):
        return None

    expected_answer = None
    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    # Read the console
    console_log_file = os.path.join(instance_dir, "console_log.txt")
    if not os.path.isfile(console_log_file):
        return None

    console_log = ""
    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    final_answer = None
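    # The console log should contain a line of the form "FINAL ANSWER: ..." -- the
    # template that scenario.py (later in this change) asks the model to follow.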
    m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
    if m:
        final_answer = m.group(1).strip()

    # Missing the final answer line
    if final_answer is None:
        return None

    # Return true if they are equal after normalization
    n_ex = normalize_answer(expected_answer)
    n_final = normalize_answer(final_answer)
    return (
        (n_ex != "" and n_ex == n_final),
        n_ex,
        n_final,
    )


def get_number_of_chat_messages(chat_messages_dir):
    result = 0
    for file in glob.glob(f"{chat_messages_dir}/*_messages.json"):
        with open(file, "r") as f:
            content = json.load(f)
            for agent, messages in content.items():
                result += len(messages)
    return result


def main(args):
    parsed_args, all_results = default_tabulate(args, scorer=scorer)

    excel_path = parsed_args.excel
    if excel_path:
        excel_dir = os.path.dirname(excel_path) or "."
        if not os.path.exists(excel_dir):
            os.makedirs(excel_dir, exist_ok=True)

        if not excel_path.endswith((".xlsx", ".xls")):
            excel_path += ".xlsx"

        runlogs = parsed_args.runlogs if parsed_args.runlogs.endswith("/") else parsed_args.runlogs + "/"

        if os.path.isdir(runlogs):
            task_ids = sorted(
                [task_id for task_id in os.listdir(runlogs) if task_id not in EXCLUDE_DIR_NAMES],
                key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)),
            )
        else:
            raise ValueError("please input a valid directory to tabulate result")

        trials = sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x)) if len(task_ids) > 0 else []
        dbnames = [[f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids] for trial in trials]
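        # An invocation may be logged more than once; keep only the earliest row per
        # invocation_id so costs and token counts are not double-counted.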
query = """
SELECT cost, session_id, response, start_time, end_time
FROM (
SELECT invocation_id, cost, session_id, response, start_time, end_time,
ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) as rn
FROM chat_completions
)
WHERE rn = 1;
"""
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            for trial_index, each_trial in enumerate(dbnames):
                result_df = pd.DataFrame(
                    columns=[
                        "id",
                        "status",
                        "expected_answer",
                        "final_answer",
                        "cost",
                        "latency",
                        "num_of_llm_requests",
                        "num_of_chat_messages",
                        "prompt_tokens",
                        "completion_tokens",
                        "total_tokens",
                        "model",
                    ]
                )

                result_df_type_mapping = {
                    "id": str,
                    "status": bool,
                    "expected_answer": str,
                    "final_answer": str,
                    "cost": float,
                    "latency": float,
                    "num_of_llm_requests": int,
                    "num_of_chat_messages": int,
                    "prompt_tokens": int,
                    "completion_tokens": int,
                    "total_tokens": int,
                }

                for dbname, scorer_results in zip(each_trial, all_results):
                    task_id = scorer_results[0]
                    scorer_result = scorer_results[trial_index + 1]
                    status, expected_answer, final_answer = scorer_result if scorer_result else (False, "", "")

                    con = sqlite3.connect(dbname)

                    # TODO: if large amount of data, add chunksize
                    telemetry_df = pd.read_sql_query(query, con)

                    earliest_starttime = pd.to_datetime(telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f").min()
                    latest_endtime = pd.to_datetime(telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f").max()

                    num_of_chat_messages = get_number_of_chat_messages(chat_messages_dir=os.path.dirname(dbname))
                    result = {
                        "id": task_id,
                        "status": status,
                        "expected_answer": expected_answer,
                        "final_answer": final_answer,
                        "cost": telemetry_df["cost"].sum(),
                        "latency": (latest_endtime - earliest_starttime).total_seconds(),
                        "num_of_llm_requests": len(telemetry_df),
                        "num_of_chat_messages": num_of_chat_messages,
                        "prompt_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["prompt_tokens"]
                            if "usage" in json.loads(x) and "prompt_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "completion_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["completion_tokens"]
                            if "usage" in json.loads(x) and "completion_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "total_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["total_tokens"]
                            if "usage" in json.loads(x) and "total_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "model": telemetry_df["response"]
                        .apply(lambda x: json.loads(x)["model"] if "model" in json.loads(x) else "")
                        .unique(),
                    }

                    result_df = result_df.astype(result_df_type_mapping)
                    result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

                result_df.to_excel(writer, sheet_name=f"trial_{trial_index}", index=False)


if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
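
The windowed deduplication query above is easy to sanity-check in isolation. A minimal, self-contained sketch against an in-memory toy telemetry.db (the table layout follows the query; the values are made up):

import sqlite3

import pandas as pd

con = sqlite3.connect(":memory:")
con.executescript(
    """
    CREATE TABLE chat_completions (invocation_id, cost, session_id, response, start_time, end_time);
    INSERT INTO chat_completions VALUES
        ('a', 0.01, 's1', '{}', '2024-07-17 09:00:00.000', '2024-07-17 09:00:01.000'),
        ('a', 0.01, 's1', '{}', '2024-07-17 09:00:05.000', '2024-07-17 09:00:06.000'),
        ('b', 0.02, 's1', '{}', '2024-07-17 09:00:02.000', '2024-07-17 09:00:03.000');
    """
)
dedup_query = """
    SELECT cost, session_id, response, start_time, end_time
    FROM (
        SELECT invocation_id, cost, session_id, response, start_time, end_time,
               ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
        FROM chat_completions
    )
    WHERE rn = 1;
"""
df = pd.read_sql_query(dedup_query, con)
assert len(df) == 2  # one row per invocation_id; the earlier 'a' row wins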

View File

@@ -0,0 +1,157 @@
#
# Run this file to download the GAIA dataset and create the corresponding testbed tasks:
# one JSONL file per [split] x [level] x [template] combination, written to ../Tasks
#
import json
import os
import re
import sys
from huggingface_hub import snapshot_download
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")
DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads")
REPO_DIR = os.path.join(DOWNLOADS_DIR, "GAIA")


def download_gaia():
    """Download the GAIA benchmark from the Hugging Face Hub."""
    if not os.path.isdir(DOWNLOADS_DIR):
        os.mkdir(DOWNLOADS_DIR)

    snapshot_download(
        repo_id="gaia-benchmark/GAIA",
        repo_type="dataset",
        local_dir=REPO_DIR,
        local_dir_use_symlinks=True,
    )


def create_jsonl(name, tasks, files_dir, template):
    """Creates a JSONL scenario file with a given name, and template path."""
    if not os.path.isdir(TASKS_DIR):
        os.mkdir(TASKS_DIR)

    with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh:
        for task in tasks:
            print(f"Converting: [{name}] {task['task_id']}")

            # Figure out what files we need to copy
            template_cp_list = [template]
            if len(task["file_name"].strip()) > 0:
                template_cp_list.append(
                    [
                        os.path.join(files_dir, task["file_name"].strip()),
                        task["file_name"].strip(),
                        # os.path.join("coding", task["file_name"].strip()),
                    ]
                )

            record = {
                "id": task["task_id"],
                "template": template_cp_list,
                "substitutions": {
                    "scenario.py": {
                        "__FILE_NAME__": task["file_name"],
                    },
                    "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
                    "prompt.txt": {"__PROMPT__": task["Question"]},
                },
            }

            fh.write(json.dumps(record).strip() + "\n")
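
# For reference, one emitted JSONL line, pretty-printed (the id, paths, file name,
# and text below are made up for illustration):
#
# {
#     "id": "example-task-id",
#     "template": ["/path/to/Templates/TeamOne", ["<files_dir>/some_file.xlsx", "some_file.xlsx"]],
#     "substitutions": {
#         "scenario.py": {"__FILE_NAME__": "some_file.xlsx"},
#         "expected_answer.txt": {"__EXPECTED_ANSWER__": "42"},
#         "prompt.txt": {"__PROMPT__": "What is ...?"}
#     }
# }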
###############################################################################
def main():
    download_gaia()

    gaia_validation_files = os.path.join(REPO_DIR, "2023", "validation")
    gaia_test_files = os.path.join(REPO_DIR, "2023", "test")

    if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files):
        sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the GAIA repository.")

    # Load the GAIA data
    gaia_validation_tasks = [[], [], []]
    with open(os.path.join(gaia_validation_files, "metadata.jsonl")) as fh:
        for line in fh:
            data = json.loads(line)
            gaia_validation_tasks[data["Level"] - 1].append(data)

    gaia_test_tasks = [[], [], []]
    with open(os.path.join(gaia_test_files, "metadata.jsonl")) as fh:
        for line in fh:
            data = json.loads(line)

            # A welcome message -- not a real task
            if data["task_id"] == "0-0-0-0-0":
                continue

            gaia_test_tasks[data["Level"] - 1].append(data)
    # List all directories in the Templates directory and populate a
    # dictionary mapping each name to its path
    templates = {}
    for entry in os.scandir(TEMPLATES_DIR):
        if entry.is_dir():
            templates[re.sub(r"\s", "", entry.name)] = entry.path

    # Add coding directories if needed (these are usually empty and left out of the repo)
    # for template in templates.values():
    #     code_dir_path = os.path.join(template, "coding")
    #     if not os.path.isdir(code_dir_path):
    #         os.mkdir(code_dir_path)
    # Create the various combinations of [split] x [level] x [template]
    for name, template_path in templates.items():
        for level in (1, 2, 3):
            create_jsonl(
                f"gaia_validation_level_{level}__{name}",
                gaia_validation_tasks[level - 1],
                gaia_validation_files,
                template_path,
            )
            create_jsonl(
                f"gaia_test_level_{level}__{name}",
                gaia_test_tasks[level - 1],
                gaia_test_files,
                template_path,
            )


if __name__ == "__main__" and __package__ is None:
    main()

View File

@@ -0,0 +1 @@
__EXPECTED_ANSWER__

View File

@@ -0,0 +1 @@
__PROMPT__

View File

@@ -0,0 +1 @@
/agnext/teams/team-one

View File

@@ -0,0 +1,197 @@
import asyncio
import json
import logging
import os
import re
from typing import Any, Dict, List, Tuple, Union

from agnext.application import SingleThreadedAgentRuntime
from agnext.application.logging import EVENT_LOGGER_NAME
from agnext.components.code_executor import LocalCommandLineCodeExecutor
from agnext.components.models import (
    AssistantMessage,
    AzureOpenAIChatCompletionClient,
    ChatCompletionClient,
    LLMMessage,
    ModelCapabilities,
    UserMessage,
)
from team_one.agents.coder import Coder, Executor
from team_one.agents.file_surfer import FileSurfer
from team_one.agents.multimodal_web_surfer import MultimodalWebSurfer
from team_one.agents.orchestrator import LedgerOrchestrator
from team_one.markdown_browser import MarkdownConverter, UnsupportedFormatException
from team_one.messages import BroadcastMessage, OrchestrationEvent, RequestReplyMessage
from team_one.utils import LogHandler, create_completion_client_from_env, message_content_to_str


async def response_preparer(task: str, source: str, client: ChatCompletionClient, transcript: List[LLMMessage]):
    messages: List[LLMMessage] = [
        UserMessage(
            content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. Here is a transcript of that conversation:",
            source=source,
        )
    ]

    # Copy the transcript into this context
    for message in transcript:
        messages.append(
            UserMessage(
                content=message_content_to_str(message.content),
                source=message.source,
            )
        )

    # Ask for the final answer
    messages.append(
        UserMessage(
            content=f"""
Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:

{task}

To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
""",
            # If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
            source=source,
        )
    )

    response = await client.create(messages)
    assert isinstance(response.content, str)

    # No answer: ask once more for an educated guess
    if "unable to determine" in response.content.lower():
        messages.append(AssistantMessage(content=response.content, source="self"))
        messages.append(
            UserMessage(
                content="""
I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.

To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
""".strip(),
                source=source,
            )
        )

        response = await client.create(messages)
        assert isinstance(response.content, str)
        return re.sub(r"EDUCATED GUESS:", "FINAL ANSWER:", response.content)
    else:
        return response.content


async def main() -> None:
    # Read the prompt
    prompt = ""
    with open("prompt.txt", "rt") as fh:
        prompt = fh.read().strip()
    filename = "__FILE_NAME__".strip()

    # Create the runtime.
    runtime = SingleThreadedAgentRuntime()

    # Create the AzureOpenAI client, with AAD auth
    # token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
    client = AzureOpenAIChatCompletionClient(
        api_version="2024-02-15-preview",
        azure_endpoint="https://aif-complex-tasks-west-us-3.openai.azure.com/",
        model="gpt-4o-2024-05-13",
        model_capabilities=ModelCapabilities(
            function_calling=True, json_output=True, vision=True
        ),
        # azure_ad_token_provider=token_provider
    )
    # Register agents.
    coder = runtime.register_and_get_proxy(
        "Coder",
        lambda: Coder(model_client=client),
    )
    executor = runtime.register_and_get_proxy(
        "Executor",
        lambda: Executor(
            "An agent for executing code", executor=LocalCommandLineCodeExecutor()
        ),
    )
    file_surfer = runtime.register_and_get_proxy(
        "file_surfer",
        lambda: FileSurfer(model_client=client),
    )
    web_surfer = runtime.register_and_get_proxy(
        "WebSurfer",
        lambda: MultimodalWebSurfer(),  # Configuration is set later by init()
    )
    orchestrator = runtime.register_and_get_proxy(
        "orchestrator",
        lambda: LedgerOrchestrator(
            agents=[coder, executor, file_surfer, web_surfer],
            model_client=client,
        ),
    )
    run_context = runtime.start()

    actual_surfer = runtime._get_agent(web_surfer.id)  # type: ignore
    assert isinstance(actual_surfer, MultimodalWebSurfer)
    await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium")

    # await runtime.send_message(RequestReplyMessage(), user_proxy.id)

    filename_prompt = ""
    if len(filename) > 0:
        # relpath = os.path.join("coding", filename)
        # file_uri = pathlib.Path(os.path.abspath(os.path.expanduser(relpath))).as_uri()
        filename_prompt = f"The question is about a file, document or image, which can be accessed by the filename '{filename}' in the current working directory."

        try:
            mdconverter = MarkdownConverter()
            res = mdconverter.convert(filename)
            if res.text_content:
                # if count_token(res.text_content) < 8000:  # Don't put overly-large documents into the prompt
                filename_prompt += "\n\nHere are the file's contents:\n\n" + res.text_content
        except UnsupportedFormatException:
            pass

        # mdconverter = MarkdownConverter(mlm_client=client)
        # mlm_prompt = f"""Write a detailed caption for this image. Pay special attention to any details that might be useful for someone answering the following:
        # {PROMPT}
        # """.strip()

    task = f"{prompt}\n\n{filename_prompt}"

    await runtime.publish_message(
        BroadcastMessage(content=UserMessage(content=task.strip(), source="human")),
        namespace="default",
    )

    await run_context.stop_when_idle()

    # Output the final answer
    actual_orchestrator = runtime._get_agent(orchestrator.id)  # type: ignore
    assert isinstance(actual_orchestrator, LedgerOrchestrator)
    transcript: List[LLMMessage] = actual_orchestrator._chat_history  # type: ignore
    print(await response_preparer(task=task, source=orchestrator.metadata["name"], client=client, transcript=transcript))
if __name__ == "__main__":
logger = logging.getLogger(EVENT_LOGGER_NAME)
logger.setLevel(logging.INFO)
log_handler = LogHandler()
logger.handlers = [log_handler]
asyncio.run(main())
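
Note the round trip with scoring: the "FINAL ANSWER: ..." line printed by response_preparer is exactly what the tabulation script's scorer, earlier in this change, extracts from console_log.txt. A minimal check of that contract (the log text is made up):

import re

console_log = "Some transcript...\nFINAL ANSWER: Paris\n"  # illustrative console output
m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
assert m is not None and m.group(1).strip() == "Paris"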