combined reports with json (#349)

2026-01-09 15:17:59 -05:00 · 2023-09-02 22:28:47 -07:00
parent 4c236b16e4
commit cd7c6139c1
8 changed files with 868 additions and 76 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,7 @@
 agbenchmark/workspace/
 backend/backend_stdout.txt
-reports/df.pkl
-reports/df_backwards.pkl
-reports/reports_raw.pkl
+reports/df*.pkl
+reports/raw*

 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/agbenchmark/generate_test.py
+++ b/agbenchmark/generate_test.py
@@ -235,9 +235,6 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
        json_file = (
            json_files.popleft()
        )  # Take and remove the first element from json_files
-
-        if challenge_should_be_ignored(json_file):
-            continue
        data = ChallengeData.get_json_from_path(json_file)
        suite_config = SuiteConfig.suite_data_if_suite(Path(json_file))

@@ -296,8 +293,4 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
            print(f"Generated test for {data['name']}.")


-def challenge_should_be_ignored(json_file):
-    return "challenges/deprecated" in json_file or "challenges/library" in json_file
-
-
 generate_tests()
--- a/agbenchmark/reports/processing/report_types.py
+++ b/agbenchmark/reports/processing/report_types.py
@@ -9,6 +9,7 @@ class Metrics(BaseModel):
    success_percent: float = Field(..., alias="success_%")
    run_time: Optional[str] = None
    fail_reason: Optional[str] = None
+    attempted: Optional[bool] = None


 class MetricsOverall(BaseModel):
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -95,8 +95,7 @@ def run_benchmark(
    test: Optional[str] = None,
    suite: Optional[str] = None,
    cutoff: Optional[int] = None,
-    api_mode: bool = False,
-    host: Optional[str] = None,
+    server: bool = False,
 ) -> int:
    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    # Check if configuration file exists and is not empty
@@ -133,12 +132,7 @@ def run_benchmark(
            config = json.load(f)
    else:
        config = {}
-    host = host or config.get("host")
-    api_mode = api_mode or config.get("api_mode")
-    if host:
-        config["host"] = host
-    if api_mode:
-        config["api_mode"] = api_mode
+
    print("benchmark run path", CONFIG_PATH, HOME_DIRECTORY)
    if not config.get("workspace"):
        config["workspace"] = click.prompt(
@@ -147,7 +141,7 @@ def run_benchmark(
            show_default=True,
        )

-    if api_mode and not host:
+    if config.get("api_mode") and not config.get("host"):
        config["host"] = click.prompt(
            "Please enter the Agent API host address",
            default="http://localhost:8000",
@@ -201,10 +195,7 @@ def run_benchmark(
        elif explore:
            print("Only attempt challenges that have never been beaten")
            pytest_args.append("--explore")
-    if host:
-        pytest_args.append(f"--host={host}")
-    if api_mode:
-        pytest_args.append("--api_mode")
+
    if mock:
        pytest_args.append("--mock")

@@ -224,8 +215,6 @@ def run_benchmark(
        print(f"Setting cuttoff override to {cutoff} seconds.")

    pytest_args.extend((str(CURRENT_DIRECTORY), "--cache-clear"))
-    pytest_args.append("--disable-warnings")
-
    return pytest.main(pytest_args)


@@ -260,8 +249,6 @@ def cli() -> None:
 )
@click.option("--nc", is_flag=True, help="Run without cutoff")
@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
-@click.option("--api_mode", help="API mode")
-@click.option("--host", help="Define API host")
 def start(
    maintain: bool,
    improve: bool,
@@ -275,8 +262,6 @@ def start(
    suite: Optional[str] = None,
    cutoff: Optional[int] = None,
    backend: Optional[bool] = False,
-    api_mode: bool = False,
-    host: Optional[str] = None,
 ) -> Any:
    # Redirect stdout if backend is True
    original_stdout = sys.stdout  # Save the original standard output
@@ -297,8 +282,6 @@ def start(
                test=test,
                suite=suite,
                cutoff=cutoff,
-                api_mode=api_mode,
-                host=host,
            )

        sys.stdout = original_stdout
@@ -420,5 +403,5 @@ def get_regression_data() -> Any:
 #     return latest_report


-if __name__ == "__main__":
-    cli()
+# if __name__ == "__main__":
+#     start()
--- a/notebooks/combined_data.ipynb
+++ b/notebooks/combined_data.ipynb
--- a/notebooks/selected_logs.json
+++ b/notebooks/selected_logs.json
@@ -0,0 +1,30 @@
+{
+    "1": {
+        "role": "assistant",
+        "content": "{\n  \"thoughts\": {\n    \"text\": \"I need to find Tesla's revenue in 2022.\",\n    \"reasoning\": \"I will perform a web search to find the information.\",\n    \"plan\": \"- Perform a web search\\n- Extract the revenue\\n- Write the revenue to a file\",\n    \"criticism\": \"I should be cautious about the reliability of the source I find.\",\n    \"speak\": \"I will search the web for Tesla's revenue in 2022.\"\n  },\n  \"command\": {\n    \"name\": \"web_search\",\n    \"args\": {\n      \"query\": \"Tesla revenue in 2022\"\n    }\n  }\n}"
+    },
+    "2": {
+        "role": "assistant",
+        "content": "{\n  \"thoughts\": {\n    \"text\": \"I found a reliable source for Tesla's revenue in 2022.\",\n    \"reasoning\": \"I will use the Statista source to extract the revenue information.\",\n    \"plan\": \"- Browse the Statista website\\n- Extract the revenue\\n- Write the revenue to a file\",\n    \"criticism\": \"I should verify the information on the Statista website.\",\n    \"speak\": \"I will browse the Statista website to find Tesla's revenue in 2022.\"\n  },\n  \"command\": {\n    \"name\": \"browse_website\",\n    \"args\": {\n      \"url\": \"https://www.statista.com/statistics/272120/revenue-of-tesla/\",\n      \"question\": \"What was Tesla's revenue in 2022?\"\n    }\n  }\n}"
+    },
+    "3": {
+        "role": "assistant",
+        "content": "Tesla's revenue in 2022 was nearly $81.5 billion, which represents a 51 percent increase from the previous year. The majority of the revenue, over $71 billion, was generated from Tesla's automotive segment. The Model Y emerged as Tesla's best-selling vehicle, and the company has been investing in the production of the Model 3 and Model Y."
+    },
+    "4": {
+        "role": "assistant",
+        "content": "The text provides various statistics and financial information about Tesla, including its revenue from 2008 to 2022. However, the specific revenue for 2022 is not mentioned in the text."
+    },
+    "5": {
+        "role": "assistant",
+        "content": "a description of different types of cookies used on a website and the options for accepting or rejecting them. There is no information provided about Tesla's revenue in 2022."
+    },
+    "6": {
+        "role": "assistant",
+        "content": "Tesla's revenue in 2022 increased by 51% to nearly $81.5 billion, with the majority coming from the automotive segment. The Model Y was the best-selling vehicle, and the company has been focusing on producing the Model 3 and Model Y."
+    },
+    "7": {
+        "role": "assistant",
+        "content": "{\n  \"thoughts\": {\n    \"text\": \"I found Tesla's revenue in 2022.\",\n    \"reasoning\": \"I will write the revenue to a file.\",\n    \"plan\": \"- Write the revenue to a file\",\n    \"criticism\": \"I should ensure the revenue is formatted correctly.\",\n    \"speak\": \"I will write Tesla's revenue in 2022 to a file.\"\n  },\n  \"command\": {\n    \"name\": \"write_to_file\",\n    \"args\": {\n      \"filename\": \"tesla_revenue_2022.txt\",\n      \"text\": \"Tesla's revenue in 2022 was $81,500,000,000.\"\n    }\n  }\n}"
+    }
+}
--- a/notebooks/selected_logs_nested.json
+++ b/notebooks/selected_logs_nested.json
@@ -0,0 +1,74 @@
+{
+    "1": {
+        "role": "assistant",
+        "content": {
+            "thoughts": {
+                "text": "I need to find Tesla's revenue in 2022.",
+                "reasoning": "I will perform a web search to find the information.",
+                "plan": "- Perform a web search\n- Extract the revenue\n- Write the revenue to a file",
+                "criticism": "I should be cautious about the reliability of the source I find.",
+                "speak": "I will search the web for Tesla's revenue in 2022."
+            },
+            "command": {
+                "name": "web_search",
+                "args": {
+                    "query": "Tesla revenue in 2022"
+                }
+            }
+        }
+    },
+    "2": {
+        "role": "assistant",
+        "content": {
+            "thoughts": {
+                "text": "I found a reliable source for Tesla's revenue in 2022.",
+                "reasoning": "I will use the Statista source to extract the revenue information.",
+                "plan": "- Browse the Statista website\n- Extract the revenue\n- Write the revenue to a file",
+                "criticism": "I should verify the information on the Statista website.",
+                "speak": "I will browse the Statista website to find Tesla's revenue in 2022."
+            },
+            "command": {
+                "name": "browse_website",
+                "args": {
+                    "url": "https://www.statista.com/statistics/272120/revenue-of-tesla/",
+                    "question": "What was Tesla's revenue in 2022?"
+                }
+            }
+        }
+    },
+    "3": {
+        "role": "assistant",
+        "content": "Tesla's revenue in 2022 was nearly $81.5 billion, which represents a 51 percent increase from the previous year. The majority of the revenue, over $71 billion, was generated from Tesla's automotive segment. The Model Y emerged as Tesla's best-selling vehicle, and the company has been investing in the production of the Model 3 and Model Y."
+    },
+    "4": {
+        "role": "assistant",
+        "content": "The text provides various statistics and financial information about Tesla, including its revenue from 2008 to 2022. However, the specific revenue for 2022 is not mentioned in the text."
+    },
+    "5": {
+        "role": "assistant",
+        "content": "a description of different types of cookies used on a website and the options for accepting or rejecting them. There is no information provided about Tesla's revenue in 2022."
+    },
+    "6": {
+        "role": "assistant",
+        "content": "Tesla's revenue in 2022 increased by 51% to nearly $81.5 billion, with the majority coming from the automotive segment. The Model Y was the best-selling vehicle, and the company has been focusing on producing the Model 3 and Model Y."
+    },
+    "7": {
+        "role": "assistant",
+        "content": {
+            "thoughts": {
+                "text": "I found Tesla's revenue in 2022.",
+                "reasoning": "I will write the revenue to a file.",
+                "plan": "- Write the revenue to a file",
+                "criticism": "I should ensure the revenue is formatted correctly.",
+                "speak": "I will write Tesla's revenue in 2022 to a file."
+            },
+            "command": {
+                "name": "write_to_file",
+                "args": {
+                    "filename": "tesla_revenue_2022.txt",
+                    "text": "Tesla's revenue in 2022 was $81,500,000,000."
+                }
+            }
+        }
+    }
+}
--- a/reports/match_records.py
+++ b/reports/match_records.py
@@ -6,6 +6,8 @@ from gql.transport.aiohttp import AIOHTTPTransport
 from gql import gql, Client
 import os

+from agbenchmark.reports.processing.report_types import Report, SuiteTest
+

 def get_reports():
    # Initialize an empty list to store the report data
@@ -22,6 +24,8 @@ def get_reports():

    # Iterate over all agent directories in the reports directory
    for agent_name in os.listdir(reports_dir):
+        if agent_name is None:
+            continue
        agent_dir = os.path.join(reports_dir, agent_name)

        # Check if the item is a directory (an agent directory)
@@ -40,38 +44,51 @@ def get_reports():
                    # Open the report.json file
                    with open(report_file, "r") as f:
                        # Load the JSON data from the file
-                        report = json.load(f)
+                        json_data = json.load(f)
+                        report = Report.parse_obj(json_data)

-                        # Iterate over all tests in the report
-                        for test_name, test_data in report["tests"].items():
-                            try:
-                                # Append the relevant data to the report_data list
-                                if agent_name is not None:
-                                    report_data.append(
-                                        {
-                                            "agent": agent_name.lower(),
-                                            "benchmark_start_time": report[
-                                                "benchmark_start_time"
-                                            ],
-                                            "challenge": test_name,
-                                            "categories": ", ".join(
-                                                test_data["category"]
-                                            ),
-                                            "task": test_data["task"],
-                                            "success": test_data["metrics"]["success"],
-                                            "difficulty": test_data["metrics"][
-                                                "difficulty"
-                                            ],
-                                            "success_%": test_data["metrics"][
-                                                "success_%"
-                                            ],
-                                            "run_time": test_data["metrics"][
-                                                "run_time"
-                                            ],
-                                        }
-                                    )
-                            except KeyError:
-                                pass
+                        for test_name, test_data in report.tests.items():
+                            test_json = {
+                                "agent": agent_name.lower(),
+                                "benchmark_start_time": report.benchmark_start_time
+                            }
+                            # A report entry is a same-task suite, a suite of separate tasks, or a plain test.
+                            if isinstance(test_data, SuiteTest):
+                                if test_data.category: # this means it's a same task test
+                                    test_json["challenge"] = test_name
+                                    test_json["attempted"] = test_data.tests[list(test_data.tests.keys())[0]].metrics.attempted
+                                    test_json["categories"] = ", ".join(test_data.category)
+                                    test_json["task"] = test_data.task
+                                    test_json["success"] = test_data.metrics.percentage
+                                    test_json["difficulty"] = test_data.metrics.highest_difficulty
+                                    test_json["success_%"] = test_data.metrics.percentage
+                                    test_json["run_time"] = test_data.metrics.run_time
+                                    test_json["is_regression"] = test_data.tests[list(test_data.tests.keys())[0]].is_regression
+                                else: # separate tasks in 1 suite
+                                    # Emit one row per sub-test; appending a copy keeps each pass's values intact.
+                                    for suite_test_name, suite_data in test_data.tests.items():
+                                        test_json["challenge"] = suite_test_name
+                                        test_json["attempted"] = suite_data.metrics.attempted
+                                        test_json["categories"] = ", ".join(suite_data.category)
+                                        test_json["task"] = suite_data.task
+                                        test_json["success"] = 100.0 if suite_data.metrics.success else 0
+                                        test_json["difficulty"] = suite_data.metrics.difficulty
+                                        test_json["success_%"] = suite_data.metrics.success_percent
+                                        test_json["run_time"] = suite_data.metrics.run_time
+                                        test_json["is_regression"] = suite_data.is_regression
+                                        report_data.append(dict(test_json))
+                                    continue  # sub-test rows were already appended inside the loop
+                            else:
+                                test_json["challenge"] = test_name
+                                test_json["attempted"] = test_data.metrics.attempted
+                                test_json["categories"] = ", ".join(test_data.category)
+                                test_json["task"] = test_data.task
+                                test_json["success"] = 100.0 if test_data.metrics.success else 0
+                                test_json["difficulty"] = test_data.metrics.difficulty
+                                test_json["success_%"] = test_data.metrics.success_percent
+                                test_json["run_time"] = test_data.metrics.run_time
+                                test_json["is_regression"] = test_data.is_regression
+                            report_data.append(test_json)
    return pd.DataFrame(report_data)


@@ -100,6 +117,7 @@ def get_helicone_data():
                    limit: $limit
                    offset: $offset
                ) {
+                    costUSD
                    prompt
                    properties{
                        name
@@ -135,10 +153,12 @@ def get_helicone_data():
                    {
                        "createdAt": item["createdAt"],
                        "agent": properties.get("agent"),
+                        "costUSD": item["costUSD"],
                        "job_id": properties.get("job_id"),
                        "challenge": properties.get("challenge"),
                        "benchmark_start_time": properties.get("benchmark_start_time"),
                        "prompt": item["prompt"],
+                        "response": item["response"],
                        "model": item["requestBody"].get("model"),
                        "request": item["requestBody"].get("messages"),
                    }
@@ -158,14 +178,14 @@ def get_helicone_data():
    return df


-if os.path.exists("reports_raw.pkl") and os.path.exists("helicone_raw.pkl"):
-    reports_df = pd.read_pickle("reports_raw.pkl")
-    helicone_df = pd.read_pickle("helicone_raw.pkl")
+if os.path.exists("raw_reports.pkl") and os.path.exists("raw_helicone.pkl"):
+    reports_df = pd.read_pickle("raw_reports.pkl")
+    helicone_df = pd.read_pickle("raw_helicone.pkl")
 else:
    reports_df = get_reports()
-    reports_df.to_pickle("reports_raw.pkl")
+    reports_df.to_pickle("raw_reports.pkl")
    helicone_df = get_helicone_data()
-    helicone_df.to_pickle("helicone_raw.pkl")
+    helicone_df.to_pickle("raw_helicone.pkl")


 def try_formats(date_str):
@@ -199,13 +219,20 @@ assert pd.api.types.is_datetime64_any_dtype(

 reports_df["report_time"] = reports_df["benchmark_start_time"]

-df = pd.merge_asof(
-    helicone_df.sort_values("benchmark_start_time"),
-    reports_df.sort_values("benchmark_start_time"),
-    left_on="benchmark_start_time",
-    right_on="benchmark_start_time",
-    by=["agent", "challenge"],
-    direction="backward",
+# df = pd.merge_asof(
+#     helicone_df.sort_values("benchmark_start_time"),
+#     reports_df.sort_values("benchmark_start_time"),
+#     left_on="benchmark_start_time",
+#     right_on="benchmark_start_time",
+#     by=["agent", "challenge"],
+#     direction="backward",
+# )
+
+df = pd.merge(
+    helicone_df,
+    reports_df,
+    on=["benchmark_start_time", "agent", "challenge"],
+    how="left",
 )

 df.to_pickle("df.pkl")