mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-01-12 00:28:31 -05:00
Compare commits
1 Commits
update-ins
...
benchmark/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8b0579a87c |
@@ -62,6 +62,9 @@ def start():
|
||||
@click.option(
|
||||
"-N", "--attempts", default=1, help="Number of times to run each challenge."
|
||||
)
|
||||
@click.option(
|
||||
"-P", "--parallel-tasks", default=1, help="Number of challenges to run in parallel."
|
||||
)
|
||||
@click.option(
|
||||
"-c",
|
||||
"--category",
|
||||
@@ -111,6 +114,7 @@ def run(
|
||||
category: tuple[str],
|
||||
skip_category: tuple[str],
|
||||
attempts: int,
|
||||
parallel_tasks: int,
|
||||
cutoff: Optional[int] = None,
|
||||
backend: Optional[bool] = False,
|
||||
# agent_path: Optional[Path] = None,
|
||||
@@ -158,6 +162,7 @@ def run(
|
||||
categories=category,
|
||||
skip_categories=skip_category,
|
||||
attempts_per_challenge=attempts,
|
||||
concurrent_tasks=parallel_tasks,
|
||||
cutoff=cutoff,
|
||||
)
|
||||
|
||||
@@ -177,6 +182,7 @@ def run(
|
||||
categories=category,
|
||||
skip_categories=skip_category,
|
||||
attempts_per_challenge=attempts,
|
||||
concurrent_tasks=parallel_tasks,
|
||||
cutoff=cutoff,
|
||||
)
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ def run_benchmark(
|
||||
categories: tuple[str] = tuple(),
|
||||
skip_categories: tuple[str] = tuple(),
|
||||
attempts_per_challenge: int = 1,
|
||||
concurrent_tasks: int = 1,
|
||||
mock: bool = False,
|
||||
no_dep: bool = False,
|
||||
no_cutoff: bool = False,
|
||||
@@ -100,6 +101,9 @@ def run_benchmark(
|
||||
if attempts_per_challenge > 1:
|
||||
pytest_args.append(f"--attempts={attempts_per_challenge}")
|
||||
|
||||
if concurrent_tasks > 1:
|
||||
pytest_args.append(f"--tests-per-worker={concurrent_tasks}")
|
||||
|
||||
if cutoff:
|
||||
pytest_args.append(f"--cutoff={cutoff}")
|
||||
logger.debug(f"Setting cuttoff override to {cutoff} seconds.")
|
||||
|
||||
@@ -3,10 +3,11 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Any, ClassVar
|
||||
|
||||
from agbenchmark.config import AgentBenchmarkConfig
|
||||
from agbenchmark.reports.processing.graphs import save_single_radar_chart
|
||||
@@ -20,39 +21,39 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SingletonReportManager:
|
||||
instance = None
|
||||
_instance = None
|
||||
_lock: ClassVar[threading.Lock] = threading.Lock()
|
||||
|
||||
INFO_MANAGER: "SessionReportManager"
|
||||
REGRESSION_MANAGER: "RegressionTestsTracker"
|
||||
SUCCESS_RATE_TRACKER: "SuccessRatesTracker"
|
||||
|
||||
def __new__(cls):
|
||||
if not cls.instance:
|
||||
cls.instance = super(SingletonReportManager, cls).__new__(cls)
|
||||
with cls._lock:
|
||||
if not cls._instance:
|
||||
cls._instance = super(SingletonReportManager, cls).__new__(cls)
|
||||
|
||||
agent_benchmark_config = AgentBenchmarkConfig.load()
|
||||
benchmark_start_time_dt = datetime.now(
|
||||
timezone.utc
|
||||
) # or any logic to fetch the datetime
|
||||
agent_benchmark_config = AgentBenchmarkConfig.load()
|
||||
benchmark_start_time_dt = datetime.now(timezone.utc)
|
||||
|
||||
# Make the Managers class attributes
|
||||
cls.INFO_MANAGER = SessionReportManager(
|
||||
agent_benchmark_config.get_report_dir(benchmark_start_time_dt)
|
||||
/ "report.json",
|
||||
benchmark_start_time_dt,
|
||||
)
|
||||
cls.REGRESSION_MANAGER = RegressionTestsTracker(
|
||||
agent_benchmark_config.regression_tests_file
|
||||
)
|
||||
cls.SUCCESS_RATE_TRACKER = SuccessRatesTracker(
|
||||
agent_benchmark_config.success_rate_file
|
||||
)
|
||||
# Make the Managers class attributes
|
||||
cls.INFO_MANAGER = SessionReportManager(
|
||||
agent_benchmark_config.get_report_dir(benchmark_start_time_dt)
|
||||
/ "report.json",
|
||||
benchmark_start_time_dt,
|
||||
)
|
||||
cls.REGRESSION_MANAGER = RegressionTestsTracker(
|
||||
agent_benchmark_config.regression_tests_file
|
||||
)
|
||||
cls.SUCCESS_RATE_TRACKER = SuccessRatesTracker(
|
||||
agent_benchmark_config.success_rate_file
|
||||
)
|
||||
|
||||
return cls.instance
|
||||
return cls._instance
|
||||
|
||||
@classmethod
|
||||
def clear_instance(cls):
|
||||
cls.instance = None
|
||||
cls._instance = None
|
||||
cls.INFO_MANAGER = None
|
||||
cls.REGRESSION_MANAGER = None
|
||||
cls.SUCCESS_RATE_TRACKER = None
|
||||
@@ -131,6 +132,12 @@ class SessionReportManager(BaseReportManager):
|
||||
|
||||
self.save()
|
||||
|
||||
def get_test_report(self, test_name: str) -> Test | None:
|
||||
if isinstance(self.tests, Report):
|
||||
return self.tests.tests.get(test_name)
|
||||
else:
|
||||
return self.tests.get(test_name)
|
||||
|
||||
def finalize_session_report(self, config: AgentBenchmarkConfig) -> None:
|
||||
command = " ".join(sys.argv)
|
||||
|
||||
|
||||
39
benchmark/poetry.lock
generated
39
benchmark/poetry.lock
generated
@@ -1946,6 +1946,17 @@ files = [
|
||||
[package.extras]
|
||||
tests = ["pytest"]
|
||||
|
||||
[[package]]
|
||||
name = "py"
|
||||
version = "1.11.0"
|
||||
description = "library with cross-python path, ini-parsing, io, code, log facilities"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
files = [
|
||||
{file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"},
|
||||
{file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyasn1"
|
||||
version = "0.5.1"
|
||||
@@ -2137,6 +2148,21 @@ pytest = ">=7.0.0"
|
||||
docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
|
||||
testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-parallel"
|
||||
version = "0.1.1"
|
||||
description = "a pytest plugin for parallel and concurrent testing"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pytest-parallel-0.1.1.tar.gz", hash = "sha256:9aac3fc199a168c0a8559b60249d9eb254de7af58c12cee0310b54d4affdbfab"},
|
||||
{file = "pytest_parallel-0.1.1-py3-none-any.whl", hash = "sha256:9e3703015b0eda52be9e07d2ba3498f09340a56d5c79a39b50f22fc5c38212fe"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pytest = ">=3.0.0"
|
||||
tblib = "*"
|
||||
|
||||
[[package]]
|
||||
name = "python-dateutil"
|
||||
version = "2.8.2"
|
||||
@@ -2431,6 +2457,17 @@ anyio = ">=3.4.0,<5"
|
||||
[package.extras]
|
||||
full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"]
|
||||
|
||||
[[package]]
|
||||
name = "tblib"
|
||||
version = "3.0.0"
|
||||
description = "Traceback serialization library."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "tblib-3.0.0-py3-none-any.whl", hash = "sha256:80a6c77e59b55e83911e1e607c649836a69c103963c5f28a46cbeef44acf8129"},
|
||||
{file = "tblib-3.0.0.tar.gz", hash = "sha256:93622790a0a29e04f0346458face1e144dc4d32f493714c6c3dff82a4adb77e6"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.10.2"
|
||||
@@ -2760,4 +2797,4 @@ multidict = ">=4.0"
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "e0d1f991958a5d630287c7bb668e7fdc6183630e06196cf6f507a086be10baec"
|
||||
content-hash = "4a4e53f252c8996b172bbb35a730197c07c53d7b50bf1d21964d3b2237495066"
|
||||
|
||||
@@ -25,7 +25,9 @@ networkx = "^3.1"
|
||||
colorama = "^0.4.6"
|
||||
pyvis = "^0.3.2"
|
||||
selenium = "^4.11.2"
|
||||
py = "^1.11.0" # needed for pytest-parallel
|
||||
pytest-asyncio = "^0.21.1"
|
||||
pytest-parallel = "^0.1.1"
|
||||
uvicorn = "^0.23.2"
|
||||
fastapi = "^0.99.0"
|
||||
python-multipart = "^0.0.6"
|
||||
|
||||
Reference in New Issue
Block a user