mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-01-09 15:17:59 -05:00
combined reports with json (#349)
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,8 +1,7 @@
|
||||
agbenchmark/workspace/
|
||||
backend/backend_stdout.txt
|
||||
reports/df.pkl
|
||||
reports/df_backwards.pkl
|
||||
reports/reports_raw.pkl
|
||||
reports/df*.pkl
|
||||
reports/raw*
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
||||
@@ -235,9 +235,6 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
|
||||
json_file = (
|
||||
json_files.popleft()
|
||||
) # Take and remove the first element from json_files
|
||||
|
||||
if challenge_should_be_ignored(json_file):
|
||||
continue
|
||||
data = ChallengeData.get_json_from_path(json_file)
|
||||
suite_config = SuiteConfig.suite_data_if_suite(Path(json_file))
|
||||
|
||||
@@ -296,8 +293,4 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
|
||||
print(f"Generated test for {data['name']}.")
|
||||
|
||||
|
||||
def challenge_should_be_ignored(json_file):
|
||||
return "challenges/deprecated" in json_file or "challenges/library" in json_file
|
||||
|
||||
|
||||
generate_tests()
|
||||
|
||||
@@ -9,6 +9,7 @@ class Metrics(BaseModel):
|
||||
success_percent: float = Field(..., alias="success_%")
|
||||
run_time: Optional[str] = None
|
||||
fail_reason: Optional[str] = None
|
||||
attempted: Optional[bool] = None
|
||||
|
||||
|
||||
class MetricsOverall(BaseModel):
|
||||
|
||||
@@ -95,8 +95,7 @@ def run_benchmark(
|
||||
test: Optional[str] = None,
|
||||
suite: Optional[str] = None,
|
||||
cutoff: Optional[int] = None,
|
||||
api_mode: bool = False,
|
||||
host: Optional[str] = None,
|
||||
server: bool = False,
|
||||
) -> int:
|
||||
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
|
||||
# Check if configuration file exists and is not empty
|
||||
@@ -133,12 +132,7 @@ def run_benchmark(
|
||||
config = json.load(f)
|
||||
else:
|
||||
config = {}
|
||||
host = host or config.get("host")
|
||||
api_mode = api_mode or config.get("api_mode")
|
||||
if host:
|
||||
config["host"] = host
|
||||
if api_mode:
|
||||
config["api_mode"] = api_mode
|
||||
|
||||
print("benchmark run path", CONFIG_PATH, HOME_DIRECTORY)
|
||||
if not config.get("workspace"):
|
||||
config["workspace"] = click.prompt(
|
||||
@@ -147,7 +141,7 @@ def run_benchmark(
|
||||
show_default=True,
|
||||
)
|
||||
|
||||
if api_mode and not host:
|
||||
if config.get("api_mode") and not config.get("host"):
|
||||
config["host"] = click.prompt(
|
||||
"Please enter the Agent API host address",
|
||||
default="http://localhost:8000",
|
||||
@@ -201,10 +195,7 @@ def run_benchmark(
|
||||
elif explore:
|
||||
print("Only attempt challenges that have never been beaten")
|
||||
pytest_args.append("--explore")
|
||||
if host:
|
||||
pytest_args.append(f"--host={host}")
|
||||
if api_mode:
|
||||
pytest_args.append("--api_mode")
|
||||
|
||||
if mock:
|
||||
pytest_args.append("--mock")
|
||||
|
||||
@@ -224,8 +215,6 @@ def run_benchmark(
|
||||
print(f"Setting cuttoff override to {cutoff} seconds.")
|
||||
|
||||
pytest_args.extend((str(CURRENT_DIRECTORY), "--cache-clear"))
|
||||
pytest_args.append("--disable-warnings")
|
||||
|
||||
return pytest.main(pytest_args)
|
||||
|
||||
|
||||
@@ -260,8 +249,6 @@ def cli() -> None:
|
||||
)
|
||||
@click.option("--nc", is_flag=True, help="Run without cutoff")
|
||||
@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
|
||||
@click.option("--api_mode", help="API mode")
|
||||
@click.option("--host", help="Define API host")
|
||||
def start(
|
||||
maintain: bool,
|
||||
improve: bool,
|
||||
@@ -275,8 +262,6 @@ def start(
|
||||
suite: Optional[str] = None,
|
||||
cutoff: Optional[int] = None,
|
||||
backend: Optional[bool] = False,
|
||||
api_mode: bool = False,
|
||||
host: Optional[str] = None,
|
||||
) -> Any:
|
||||
# Redirect stdout if backend is True
|
||||
original_stdout = sys.stdout # Save the original standard output
|
||||
@@ -297,8 +282,6 @@ def start(
|
||||
test=test,
|
||||
suite=suite,
|
||||
cutoff=cutoff,
|
||||
api_mode=api_mode,
|
||||
host=host,
|
||||
)
|
||||
|
||||
sys.stdout = original_stdout
|
||||
@@ -420,5 +403,5 @@ def get_regression_data() -> Any:
|
||||
# return latest_report
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
# if __name__ == "__main__":
|
||||
# start()
|
||||
|
||||
685
notebooks/combined_data.ipynb
Normal file
685
notebooks/combined_data.ipynb
Normal file
File diff suppressed because one or more lines are too long
30
notebooks/selected_logs.json
Normal file
30
notebooks/selected_logs.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"1": {
|
||||
"role": "assistant",
|
||||
"content": "{\n \"thoughts\": {\n \"text\": \"I need to find Tesla's revenue in 2022.\",\n \"reasoning\": \"I will perform a web search to find the information.\",\n \"plan\": \"- Perform a web search\\n- Extract the revenue\\n- Write the revenue to a file\",\n \"criticism\": \"I should be cautious about the reliability of the source I find.\",\n \"speak\": \"I will search the web for Tesla's revenue in 2022.\"\n },\n \"command\": {\n \"name\": \"web_search\",\n \"args\": {\n \"query\": \"Tesla revenue in 2022\"\n }\n }\n}"
|
||||
},
|
||||
"2": {
|
||||
"role": "assistant",
|
||||
"content": "{\n \"thoughts\": {\n \"text\": \"I found a reliable source for Tesla's revenue in 2022.\",\n \"reasoning\": \"I will use the Statista source to extract the revenue information.\",\n \"plan\": \"- Browse the Statista website\\n- Extract the revenue\\n- Write the revenue to a file\",\n \"criticism\": \"I should verify the information on the Statista website.\",\n \"speak\": \"I will browse the Statista website to find Tesla's revenue in 2022.\"\n },\n \"command\": {\n \"name\": \"browse_website\",\n \"args\": {\n \"url\": \"https://www.statista.com/statistics/272120/revenue-of-tesla/\",\n \"question\": \"What was Tesla's revenue in 2022?\"\n }\n }\n}"
|
||||
},
|
||||
"3": {
|
||||
"role": "assistant",
|
||||
"content": "Tesla's revenue in 2022 was nearly $81.5 billion, which represents a 51 percent increase from the previous year. The majority of the revenue, over $71 billion, was generated from Tesla's automotive segment. The Model Y emerged as Tesla's best-selling vehicle, and the company has been investing in the production of the Model 3 and Model Y."
|
||||
},
|
||||
"4": {
|
||||
"role": "assistant",
|
||||
"content": "The text provides various statistics and financial information about Tesla, including its revenue from 2008 to 2022. However, the specific revenue for 2022 is not mentioned in the text."
|
||||
},
|
||||
"5": {
|
||||
"role": "assistant",
|
||||
"content": "a description of different types of cookies used on a website and the options for accepting or rejecting them. There is no information provided about Tesla's revenue in 2022."
|
||||
},
|
||||
"6": {
|
||||
"role": "assistant",
|
||||
"content": "Tesla's revenue in 2022 increased by 51% to nearly $81.5 billion, with the majority coming from the automotive segment. The Model Y was the best-selling vehicle, and the company has been focusing on producing the Model 3 and Model Y."
|
||||
},
|
||||
"7": {
|
||||
"role": "assistant",
|
||||
"content": "{\n \"thoughts\": {\n \"text\": \"I found Tesla's revenue in 2022.\",\n \"reasoning\": \"I will write the revenue to a file.\",\n \"plan\": \"- Write the revenue to a file\",\n \"criticism\": \"I should ensure the revenue is formatted correctly.\",\n \"speak\": \"I will write Tesla's revenue in 2022 to a file.\"\n },\n \"command\": {\n \"name\": \"write_to_file\",\n \"args\": {\n \"filename\": \"tesla_revenue_2022.txt\",\n \"text\": \"Tesla's revenue in 2022 was $81,500,000,000.\"\n }\n }\n}"
|
||||
}
|
||||
}
|
||||
74
notebooks/selected_logs_nested.json
Normal file
74
notebooks/selected_logs_nested.json
Normal file
@@ -0,0 +1,74 @@
|
||||
{
|
||||
"1": {
|
||||
"role": "assistant",
|
||||
"content": {
|
||||
"thoughts": {
|
||||
"text": "I need to find Tesla's revenue in 2022.",
|
||||
"reasoning": "I will perform a web search to find the information.",
|
||||
"plan": "- Perform a web search\n- Extract the revenue\n- Write the revenue to a file",
|
||||
"criticism": "I should be cautious about the reliability of the source I find.",
|
||||
"speak": "I will search the web for Tesla's revenue in 2022."
|
||||
},
|
||||
"command": {
|
||||
"name": "web_search",
|
||||
"args": {
|
||||
"query": "Tesla revenue in 2022"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"2": {
|
||||
"role": "assistant",
|
||||
"content": {
|
||||
"thoughts": {
|
||||
"text": "I found a reliable source for Tesla's revenue in 2022.",
|
||||
"reasoning": "I will use the Statista source to extract the revenue information.",
|
||||
"plan": "- Browse the Statista website\n- Extract the revenue\n- Write the revenue to a file",
|
||||
"criticism": "I should verify the information on the Statista website.",
|
||||
"speak": "I will browse the Statista website to find Tesla's revenue in 2022."
|
||||
},
|
||||
"command": {
|
||||
"name": "browse_website",
|
||||
"args": {
|
||||
"url": "https://www.statista.com/statistics/272120/revenue-of-tesla/",
|
||||
"question": "What was Tesla's revenue in 2022?"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"3": {
|
||||
"role": "assistant",
|
||||
"content": "Tesla's revenue in 2022 was nearly $81.5 billion, which represents a 51 percent increase from the previous year. The majority of the revenue, over $71 billion, was generated from Tesla's automotive segment. The Model Y emerged as Tesla's best-selling vehicle, and the company has been investing in the production of the Model 3 and Model Y."
|
||||
},
|
||||
"4": {
|
||||
"role": "assistant",
|
||||
"content": "The text provides various statistics and financial information about Tesla, including its revenue from 2008 to 2022. However, the specific revenue for 2022 is not mentioned in the text."
|
||||
},
|
||||
"5": {
|
||||
"role": "assistant",
|
||||
"content": "a description of different types of cookies used on a website and the options for accepting or rejecting them. There is no information provided about Tesla's revenue in 2022."
|
||||
},
|
||||
"6": {
|
||||
"role": "assistant",
|
||||
"content": "Tesla's revenue in 2022 increased by 51% to nearly $81.5 billion, with the majority coming from the automotive segment. The Model Y was the best-selling vehicle, and the company has been focusing on producing the Model 3 and Model Y."
|
||||
},
|
||||
"7": {
|
||||
"role": "assistant",
|
||||
"content": {
|
||||
"thoughts": {
|
||||
"text": "I found Tesla's revenue in 2022.",
|
||||
"reasoning": "I will write the revenue to a file.",
|
||||
"plan": "- Write the revenue to a file",
|
||||
"criticism": "I should ensure the revenue is formatted correctly.",
|
||||
"speak": "I will write Tesla's revenue in 2022 to a file."
|
||||
},
|
||||
"command": {
|
||||
"name": "write_to_file",
|
||||
"args": {
|
||||
"filename": "tesla_revenue_2022.txt",
|
||||
"text": "Tesla's revenue in 2022 was $81,500,000,000."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,8 @@ from gql.transport.aiohttp import AIOHTTPTransport
|
||||
from gql import gql, Client
|
||||
import os
|
||||
|
||||
from agbenchmark.reports.processing.report_types import Report, SuiteTest
|
||||
|
||||
|
||||
def get_reports():
|
||||
# Initialize an empty list to store the report data
|
||||
@@ -22,6 +24,8 @@ def get_reports():
|
||||
|
||||
# Iterate over all agent directories in the reports directory
|
||||
for agent_name in os.listdir(reports_dir):
|
||||
if agent_name is None:
|
||||
continue
|
||||
agent_dir = os.path.join(reports_dir, agent_name)
|
||||
|
||||
# Check if the item is a directory (an agent directory)
|
||||
@@ -40,38 +44,51 @@ def get_reports():
|
||||
# Open the report.json file
|
||||
with open(report_file, "r") as f:
|
||||
# Load the JSON data from the file
|
||||
report = json.load(f)
|
||||
json_data = json.load(f)
|
||||
report = Report.parse_obj(json_data)
|
||||
|
||||
# Iterate over all tests in the report
|
||||
for test_name, test_data in report["tests"].items():
|
||||
try:
|
||||
# Append the relevant data to the report_data list
|
||||
if agent_name is not None:
|
||||
report_data.append(
|
||||
{
|
||||
"agent": agent_name.lower(),
|
||||
"benchmark_start_time": report[
|
||||
"benchmark_start_time"
|
||||
],
|
||||
"challenge": test_name,
|
||||
"categories": ", ".join(
|
||||
test_data["category"]
|
||||
),
|
||||
"task": test_data["task"],
|
||||
"success": test_data["metrics"]["success"],
|
||||
"difficulty": test_data["metrics"][
|
||||
"difficulty"
|
||||
],
|
||||
"success_%": test_data["metrics"][
|
||||
"success_%"
|
||||
],
|
||||
"run_time": test_data["metrics"][
|
||||
"run_time"
|
||||
],
|
||||
}
|
||||
)
|
||||
except KeyError:
|
||||
pass
|
||||
for test_name, test_data in report.tests.items():
|
||||
test_json = {
|
||||
"agent": agent_name.lower(),
|
||||
"benchmark_start_time": report.benchmark_start_time
|
||||
}
|
||||
|
||||
if isinstance(test_data, SuiteTest):
|
||||
if test_data.category: # this means it's a same task test
|
||||
test_json["challenge"] = test_name
|
||||
test_json["attempted"] = test_data.tests[list(test_data.tests.keys())[0]].metrics.attempted
|
||||
test_json["categories"] = ", ".join(test_data.category)
|
||||
test_json["task"] = test_data.task
|
||||
test_json["success"] = test_data.metrics.percentage
|
||||
test_json["difficulty"] = test_data.metrics.highest_difficulty
|
||||
test_json["success_%"] = test_data.metrics.percentage
|
||||
test_json["run_time"] = test_data.metrics.run_time
|
||||
test_json["is_regression"] = test_data.tests[list(test_data.tests.keys())[0]].is_regression
|
||||
else: # separate tasks in 1 suite
|
||||
for suite_test_name, suite_data in test_data.tests.items():
|
||||
test_json["challenge"] = suite_test_name
|
||||
test_json["attempted"] = suite_data.metrics.attempted
|
||||
test_json["categories"] = ", ".join(suite_data.category)
|
||||
test_json["task"] = suite_data.task
|
||||
test_json["success"] = 100.0 if suite_data.metrics.success else 0
|
||||
test_json["difficulty"] = suite_data.metrics.difficulty
|
||||
test_json["success_%"] = suite_data.metrics.success_percent
|
||||
test_json["run_time"] = suite_data.metrics.run_time
|
||||
test_json["is_regression"] = suite_data.is_regression
|
||||
|
||||
else:
|
||||
test_json["challenge"] = test_name
|
||||
test_json["attempted"] = test_data.metrics.attempted
|
||||
test_json["categories"] = ", ".join(test_data.category)
|
||||
test_json["task"] = test_data.task
|
||||
test_json["success"] = 100.0 if test_data.metrics.success else 0
|
||||
test_json["difficulty"] = test_data.metrics.difficulty
|
||||
test_json["success_%"] = test_data.metrics.success_percent
|
||||
test_json["run_time"] = test_data.metrics.run_time
|
||||
test_json["is_regression"] = test_data.is_regression
|
||||
|
||||
report_data.append(test_json)
|
||||
|
||||
return pd.DataFrame(report_data)
|
||||
|
||||
|
||||
@@ -100,6 +117,7 @@ def get_helicone_data():
|
||||
limit: $limit
|
||||
offset: $offset
|
||||
) {
|
||||
costUSD
|
||||
prompt
|
||||
properties{
|
||||
name
|
||||
@@ -135,10 +153,12 @@ def get_helicone_data():
|
||||
{
|
||||
"createdAt": item["createdAt"],
|
||||
"agent": properties.get("agent"),
|
||||
"costUSD": item["costUSD"],
|
||||
"job_id": properties.get("job_id"),
|
||||
"challenge": properties.get("challenge"),
|
||||
"benchmark_start_time": properties.get("benchmark_start_time"),
|
||||
"prompt": item["prompt"],
|
||||
"response": item["response"],
|
||||
"model": item["requestBody"].get("model"),
|
||||
"request": item["requestBody"].get("messages"),
|
||||
}
|
||||
@@ -158,14 +178,14 @@ def get_helicone_data():
|
||||
return df
|
||||
|
||||
|
||||
if os.path.exists("reports_raw.pkl") and os.path.exists("helicone_raw.pkl"):
|
||||
reports_df = pd.read_pickle("reports_raw.pkl")
|
||||
helicone_df = pd.read_pickle("helicone_raw.pkl")
|
||||
if os.path.exists("raw_reports.pkl") and os.path.exists("raw_helicone.pkl"):
|
||||
reports_df = pd.read_pickle("raw_reports.pkl")
|
||||
helicone_df = pd.read_pickle("raw_helicone.pkl")
|
||||
else:
|
||||
reports_df = get_reports()
|
||||
reports_df.to_pickle("reports_raw.pkl")
|
||||
reports_df.to_pickle("raw_reports.pkl")
|
||||
helicone_df = get_helicone_data()
|
||||
helicone_df.to_pickle("helicone_raw.pkl")
|
||||
helicone_df.to_pickle("raw_helicone.pkl")
|
||||
|
||||
|
||||
def try_formats(date_str):
|
||||
@@ -199,13 +219,20 @@ assert pd.api.types.is_datetime64_any_dtype(
|
||||
|
||||
reports_df["report_time"] = reports_df["benchmark_start_time"]
|
||||
|
||||
df = pd.merge_asof(
|
||||
helicone_df.sort_values("benchmark_start_time"),
|
||||
reports_df.sort_values("benchmark_start_time"),
|
||||
left_on="benchmark_start_time",
|
||||
right_on="benchmark_start_time",
|
||||
by=["agent", "challenge"],
|
||||
direction="backward",
|
||||
# df = pd.merge_asof(
|
||||
# helicone_df.sort_values("benchmark_start_time"),
|
||||
# reports_df.sort_values("benchmark_start_time"),
|
||||
# left_on="benchmark_start_time",
|
||||
# right_on="benchmark_start_time",
|
||||
# by=["agent", "challenge"],
|
||||
# direction="backward",
|
||||
# )
|
||||
|
||||
df = pd.merge(
|
||||
helicone_df,
|
||||
reports_df,
|
||||
on=["benchmark_start_time", "agent", "challenge"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
df.to_pickle("df.pkl")
|
||||
|
||||
Reference in New Issue
Block a user