combined reports with json (#349)

This commit is contained in:
Silen Naihin
2023-09-02 22:28:47 -07:00
committed by GitHub
parent 4c236b16e4
commit cd7c6139c1
8 changed files with 868 additions and 76 deletions

5
.gitignore vendored
View File

@@ -1,8 +1,7 @@
agbenchmark/workspace/
backend/backend_stdout.txt
reports/df.pkl
reports/df_backwards.pkl
reports/reports_raw.pkl
reports/df*.pkl
reports/raw*
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@@ -235,9 +235,6 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
json_file = (
json_files.popleft()
) # Take and remove the first element from json_files
if challenge_should_be_ignored(json_file):
continue
data = ChallengeData.get_json_from_path(json_file)
suite_config = SuiteConfig.suite_data_if_suite(Path(json_file))
@@ -296,8 +293,4 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
print(f"Generated test for {data['name']}.")
def challenge_should_be_ignored(json_file):
    """Return True when a challenge file sits in an ignored directory.

    A path is ignored when it contains ``challenges/deprecated`` or
    ``challenges/library``.

    Args:
        json_file: Path to a challenge JSON file, given as a string or a
            path-like object.

    Returns:
        bool: True if the path points into one of the ignored directories,
        False otherwise.
    """
    # Normalise to a forward-slash string so the substring checks also match
    # Windows-style paths ("challenges\\deprecated\\...") and Path objects.
    normalized = str(json_file).replace("\\", "/")
    return "challenges/deprecated" in normalized or "challenges/library" in normalized
generate_tests()

View File

@@ -9,6 +9,7 @@ class Metrics(BaseModel):
success_percent: float = Field(..., alias="success_%")
run_time: Optional[str] = None
fail_reason: Optional[str] = None
attempted: Optional[bool] = None
class MetricsOverall(BaseModel):

View File

@@ -95,8 +95,7 @@ def run_benchmark(
test: Optional[str] = None,
suite: Optional[str] = None,
cutoff: Optional[int] = None,
api_mode: bool = False,
host: Optional[str] = None,
server: bool = False,
) -> int:
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
# Check if configuration file exists and is not empty
@@ -133,12 +132,7 @@ def run_benchmark(
config = json.load(f)
else:
config = {}
host = host or config.get("host")
api_mode = api_mode or config.get("api_mode")
if host:
config["host"] = host
if api_mode:
config["api_mode"] = api_mode
print("benchmark run path", CONFIG_PATH, HOME_DIRECTORY)
if not config.get("workspace"):
config["workspace"] = click.prompt(
@@ -147,7 +141,7 @@ def run_benchmark(
show_default=True,
)
if api_mode and not host:
if config.get("api_mode") and not config.get("host"):
config["host"] = click.prompt(
"Please enter the Agent API host address",
default="http://localhost:8000",
@@ -201,10 +195,7 @@ def run_benchmark(
elif explore:
print("Only attempt challenges that have never been beaten")
pytest_args.append("--explore")
if host:
pytest_args.append(f"--host={host}")
if api_mode:
pytest_args.append("--api_mode")
if mock:
pytest_args.append("--mock")
@@ -224,8 +215,6 @@ def run_benchmark(
print(f"Setting cuttoff override to {cutoff} seconds.")
pytest_args.extend((str(CURRENT_DIRECTORY), "--cache-clear"))
pytest_args.append("--disable-warnings")
return pytest.main(pytest_args)
@@ -260,8 +249,6 @@ def cli() -> None:
)
@click.option("--nc", is_flag=True, help="Run without cutoff")
@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
@click.option("--api_mode", help="API mode")
@click.option("--host", help="Define API host")
def start(
maintain: bool,
improve: bool,
@@ -275,8 +262,6 @@ def start(
suite: Optional[str] = None,
cutoff: Optional[int] = None,
backend: Optional[bool] = False,
api_mode: bool = False,
host: Optional[str] = None,
) -> Any:
# Redirect stdout if backend is True
original_stdout = sys.stdout # Save the original standard output
@@ -297,8 +282,6 @@ def start(
test=test,
suite=suite,
cutoff=cutoff,
api_mode=api_mode,
host=host,
)
sys.stdout = original_stdout
@@ -420,5 +403,5 @@ def get_regression_data() -> Any:
# return latest_report
if __name__ == "__main__":
cli()
# if __name__ == "__main__":
# start()

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,30 @@
{
"1": {
"role": "assistant",
"content": "{\n \"thoughts\": {\n \"text\": \"I need to find Tesla's revenue in 2022.\",\n \"reasoning\": \"I will perform a web search to find the information.\",\n \"plan\": \"- Perform a web search\\n- Extract the revenue\\n- Write the revenue to a file\",\n \"criticism\": \"I should be cautious about the reliability of the source I find.\",\n \"speak\": \"I will search the web for Tesla's revenue in 2022.\"\n },\n \"command\": {\n \"name\": \"web_search\",\n \"args\": {\n \"query\": \"Tesla revenue in 2022\"\n }\n }\n}"
},
"2": {
"role": "assistant",
"content": "{\n \"thoughts\": {\n \"text\": \"I found a reliable source for Tesla's revenue in 2022.\",\n \"reasoning\": \"I will use the Statista source to extract the revenue information.\",\n \"plan\": \"- Browse the Statista website\\n- Extract the revenue\\n- Write the revenue to a file\",\n \"criticism\": \"I should verify the information on the Statista website.\",\n \"speak\": \"I will browse the Statista website to find Tesla's revenue in 2022.\"\n },\n \"command\": {\n \"name\": \"browse_website\",\n \"args\": {\n \"url\": \"https://www.statista.com/statistics/272120/revenue-of-tesla/\",\n \"question\": \"What was Tesla's revenue in 2022?\"\n }\n }\n}"
},
"3": {
"role": "assistant",
"content": "Tesla's revenue in 2022 was nearly $81.5 billion, which represents a 51 percent increase from the previous year. The majority of the revenue, over $71 billion, was generated from Tesla's automotive segment. The Model Y emerged as Tesla's best-selling vehicle, and the company has been investing in the production of the Model 3 and Model Y."
},
"4": {
"role": "assistant",
"content": "The text provides various statistics and financial information about Tesla, including its revenue from 2008 to 2022. However, the specific revenue for 2022 is not mentioned in the text."
},
"5": {
"role": "assistant",
"content": "a description of different types of cookies used on a website and the options for accepting or rejecting them. There is no information provided about Tesla's revenue in 2022."
},
"6": {
"role": "assistant",
"content": "Tesla's revenue in 2022 increased by 51% to nearly $81.5 billion, with the majority coming from the automotive segment. The Model Y was the best-selling vehicle, and the company has been focusing on producing the Model 3 and Model Y."
},
"7": {
"role": "assistant",
"content": "{\n \"thoughts\": {\n \"text\": \"I found Tesla's revenue in 2022.\",\n \"reasoning\": \"I will write the revenue to a file.\",\n \"plan\": \"- Write the revenue to a file\",\n \"criticism\": \"I should ensure the revenue is formatted correctly.\",\n \"speak\": \"I will write Tesla's revenue in 2022 to a file.\"\n },\n \"command\": {\n \"name\": \"write_to_file\",\n \"args\": {\n \"filename\": \"tesla_revenue_2022.txt\",\n \"text\": \"Tesla's revenue in 2022 was $81,500,000,000.\"\n }\n }\n}"
}
}

View File

@@ -0,0 +1,74 @@
{
"1": {
"role": "assistant",
"content": {
"thoughts": {
"text": "I need to find Tesla's revenue in 2022.",
"reasoning": "I will perform a web search to find the information.",
"plan": "- Perform a web search\n- Extract the revenue\n- Write the revenue to a file",
"criticism": "I should be cautious about the reliability of the source I find.",
"speak": "I will search the web for Tesla's revenue in 2022."
},
"command": {
"name": "web_search",
"args": {
"query": "Tesla revenue in 2022"
}
}
}
},
"2": {
"role": "assistant",
"content": {
"thoughts": {
"text": "I found a reliable source for Tesla's revenue in 2022.",
"reasoning": "I will use the Statista source to extract the revenue information.",
"plan": "- Browse the Statista website\n- Extract the revenue\n- Write the revenue to a file",
"criticism": "I should verify the information on the Statista website.",
"speak": "I will browse the Statista website to find Tesla's revenue in 2022."
},
"command": {
"name": "browse_website",
"args": {
"url": "https://www.statista.com/statistics/272120/revenue-of-tesla/",
"question": "What was Tesla's revenue in 2022?"
}
}
}
},
"3": {
"role": "assistant",
"content": "Tesla's revenue in 2022 was nearly $81.5 billion, which represents a 51 percent increase from the previous year. The majority of the revenue, over $71 billion, was generated from Tesla's automotive segment. The Model Y emerged as Tesla's best-selling vehicle, and the company has been investing in the production of the Model 3 and Model Y."
},
"4": {
"role": "assistant",
"content": "The text provides various statistics and financial information about Tesla, including its revenue from 2008 to 2022. However, the specific revenue for 2022 is not mentioned in the text."
},
"5": {
"role": "assistant",
"content": "a description of different types of cookies used on a website and the options for accepting or rejecting them. There is no information provided about Tesla's revenue in 2022."
},
"6": {
"role": "assistant",
"content": "Tesla's revenue in 2022 increased by 51% to nearly $81.5 billion, with the majority coming from the automotive segment. The Model Y was the best-selling vehicle, and the company has been focusing on producing the Model 3 and Model Y."
},
"7": {
"role": "assistant",
"content": {
"thoughts": {
"text": "I found Tesla's revenue in 2022.",
"reasoning": "I will write the revenue to a file.",
"plan": "- Write the revenue to a file",
"criticism": "I should ensure the revenue is formatted correctly.",
"speak": "I will write Tesla's revenue in 2022 to a file."
},
"command": {
"name": "write_to_file",
"args": {
"filename": "tesla_revenue_2022.txt",
"text": "Tesla's revenue in 2022 was $81,500,000,000."
}
}
}
}
}

View File

@@ -6,6 +6,8 @@ from gql.transport.aiohttp import AIOHTTPTransport
from gql import gql, Client
import os
from agbenchmark.reports.processing.report_types import Report, SuiteTest
def get_reports():
# Initialize an empty list to store the report data
@@ -22,6 +24,8 @@ def get_reports():
# Iterate over all agent directories in the reports directory
for agent_name in os.listdir(reports_dir):
if agent_name is None:
continue
agent_dir = os.path.join(reports_dir, agent_name)
# Check if the item is a directory (an agent directory)
@@ -40,38 +44,51 @@ def get_reports():
# Open the report.json file
with open(report_file, "r") as f:
# Load the JSON data from the file
report = json.load(f)
json_data = json.load(f)
report = Report.parse_obj(json_data)
# Iterate over all tests in the report
for test_name, test_data in report["tests"].items():
try:
# Append the relevant data to the report_data list
if agent_name is not None:
report_data.append(
{
"agent": agent_name.lower(),
"benchmark_start_time": report[
"benchmark_start_time"
],
"challenge": test_name,
"categories": ", ".join(
test_data["category"]
),
"task": test_data["task"],
"success": test_data["metrics"]["success"],
"difficulty": test_data["metrics"][
"difficulty"
],
"success_%": test_data["metrics"][
"success_%"
],
"run_time": test_data["metrics"][
"run_time"
],
}
)
except KeyError:
pass
for test_name, test_data in report.tests.items():
test_json = {
"agent": agent_name.lower(),
"benchmark_start_time": report.benchmark_start_time
}
if isinstance(test_data, SuiteTest):
if test_data.category: # this means it's a same task test
test_json["challenge"] = test_name
test_json["attempted"] = test_data.tests[list(test_data.tests.keys())[0]].metrics.attempted
test_json["categories"] = ", ".join(test_data.category)
test_json["task"] = test_data.task
test_json["success"] = test_data.metrics.percentage
test_json["difficulty"] = test_data.metrics.highest_difficulty
test_json["success_%"] = test_data.metrics.percentage
test_json["run_time"] = test_data.metrics.run_time
test_json["is_regression"] = test_data.tests[list(test_data.tests.keys())[0]].is_regression
else: # separate tasks in 1 suite
for suite_test_name, suite_data in test_data.tests.items():
test_json["challenge"] = suite_test_name
test_json["attempted"] = suite_data.metrics.attempted
test_json["categories"] = ", ".join(suite_data.category)
test_json["task"] = suite_data.task
test_json["success"] = 100.0 if suite_data.metrics.success else 0
test_json["difficulty"] = suite_data.metrics.difficulty
test_json["success_%"] = suite_data.metrics.success_percent
test_json["run_time"] = suite_data.metrics.run_time
test_json["is_regression"] = suite_data.is_regression
else:
test_json["challenge"] = test_name
test_json["attempted"] = test_data.metrics.attempted
test_json["categories"] = ", ".join(test_data.category)
test_json["task"] = test_data.task
test_json["success"] = 100.0 if test_data.metrics.success else 0
test_json["difficulty"] = test_data.metrics.difficulty
test_json["success_%"] = test_data.metrics.success_percent
test_json["run_time"] = test_data.metrics.run_time
test_json["is_regression"] = test_data.is_regression
report_data.append(test_json)
return pd.DataFrame(report_data)
@@ -100,6 +117,7 @@ def get_helicone_data():
limit: $limit
offset: $offset
) {
costUSD
prompt
properties{
name
@@ -135,10 +153,12 @@ def get_helicone_data():
{
"createdAt": item["createdAt"],
"agent": properties.get("agent"),
"costUSD": item["costUSD"],
"job_id": properties.get("job_id"),
"challenge": properties.get("challenge"),
"benchmark_start_time": properties.get("benchmark_start_time"),
"prompt": item["prompt"],
"response": item["response"],
"model": item["requestBody"].get("model"),
"request": item["requestBody"].get("messages"),
}
@@ -158,14 +178,14 @@ def get_helicone_data():
return df
if os.path.exists("reports_raw.pkl") and os.path.exists("helicone_raw.pkl"):
reports_df = pd.read_pickle("reports_raw.pkl")
helicone_df = pd.read_pickle("helicone_raw.pkl")
if os.path.exists("raw_reports.pkl") and os.path.exists("raw_helicone.pkl"):
reports_df = pd.read_pickle("raw_reports.pkl")
helicone_df = pd.read_pickle("raw_helicone.pkl")
else:
reports_df = get_reports()
reports_df.to_pickle("reports_raw.pkl")
reports_df.to_pickle("raw_reports.pkl")
helicone_df = get_helicone_data()
helicone_df.to_pickle("helicone_raw.pkl")
helicone_df.to_pickle("raw_helicone.pkl")
def try_formats(date_str):
@@ -199,13 +219,20 @@ assert pd.api.types.is_datetime64_any_dtype(
reports_df["report_time"] = reports_df["benchmark_start_time"]
df = pd.merge_asof(
helicone_df.sort_values("benchmark_start_time"),
reports_df.sort_values("benchmark_start_time"),
left_on="benchmark_start_time",
right_on="benchmark_start_time",
by=["agent", "challenge"],
direction="backward",
# df = pd.merge_asof(
# helicone_df.sort_values("benchmark_start_time"),
# reports_df.sort_values("benchmark_start_time"),
# left_on="benchmark_start_time",
# right_on="benchmark_start_time",
# by=["agent", "challenge"],
# direction="backward",
# )
df = pd.merge(
helicone_df,
reports_df,
on=["benchmark_start_time", "agent", "challenge"],
how="left",
)
df.to_pickle("df.pkl")