Mirror of https://github.com/Significant-Gravitas/AutoGPT.git, synced 2026-02-03 11:24:57 -05:00
**Fix all lint/type errors in `autogpt`, `forge`, and `benchmark`**

### Linting
- Clean up linter configs for `autogpt`, `forge`, and `benchmark`
- Add type checking with Pyright
- Create unified pre-commit config
- Create unified linting and type checking CI workflow

### Testing
- Synchronize CI test setups for `autogpt`, `forge`, and `benchmark`
- Add missing pytest-cov to benchmark dependencies
- Mark GCS tests as slow to speed up pre-commit test runs
- Repair `forge` test suite
- Add `AgentDB.close()` method for test DB teardown in db_test.py
- Use an actual temporary dir instead of forge/test_workspace/
- Move left-behind dependencies for moved `forge` code from autogpt to forge

### Notable type changes
- Replace uses of `ChatModelProvider` with `MultiProvider`
- Remove unnecessary exports from various `__init__.py` files
- Simplify the `FileStorage.open_file` signature by removing `IOBase` from the return type union
- Implement an `S3BinaryIOWrapper(BinaryIO)` type interposer for `S3FileStorage`
- Expand the overloads of `GCSFileStorage.open_file` for improved typing of read and write modes.
  Type checking had to be silenced for the extra overloads because (I think) Pyright is reporting a false positive: https://github.com/microsoft/pyright/issues/8007
- Change the `count_tokens`, `get_tokenizer`, and `count_message_tokens` methods on `ModelProvider`s from class methods to instance methods
- Move the `CompletionModelFunction.schema` method to the helper function `format_function_def_for_openai` in `forge.llm.providers.openai`
- Rename `ModelProvider` -> `BaseModelProvider`
- Rename `ChatModelProvider` -> `BaseChatModelProvider`
- Add a type `ChatModelProvider` which is a union of all subclasses of `BaseChatModelProvider` (see the sketch after this message)

### Removed rather than fixed
- Remove the deprecated and broken autogpt/agbenchmark_config/benchmarks.py
- Remove various base classes and properties on base classes in `forge.llm.providers.schema` and `forge.models.providers`

### Fixes for other issues that came to light
- Clean up `forge.agent_protocol.api_router`, `forge.agent_protocol.database`, and `forge.agent.agent`
- Add fallback behavior to `ImageGeneratorComponent`
- Remove a test for deprecated failure behavior
- Fix the `agbenchmark.challenges.builtin` challenge exclusion mechanism on Windows
- Fix `_tool_calls_compat_extract_calls` in `forge.llm.providers.openai`
- Add support for `any` (= no type specified) in `JSONSchema.typescript_type`
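The provider renames are easiest to see side by side. Below is a minimal sketch of the new naming scheme, assuming heavily simplified class bodies; the real classes in `forge.llm.providers` carry configuration, credentials, and async chat APIs that are all omitted here:

```python
from abc import ABC, abstractmethod


class BaseChatModelProvider(ABC):  # formerly `ChatModelProvider`
    @abstractmethod
    def count_tokens(self, text: str) -> int:
        """Now an instance method (previously a class method)."""


class OpenAIProvider(BaseChatModelProvider):
    def count_tokens(self, text: str) -> int:
        return len(text) // 4  # placeholder heuristic, not real tokenization


class AnthropicProvider(BaseChatModelProvider):
    def count_tokens(self, text: str) -> int:
        return len(text) // 4  # placeholder heuristic, not real tokenization


# The old name survives as a union of the concrete subclasses, so annotations
# like `provider: ChatModelProvider` keep working at call sites:
ChatModelProvider = OpenAIProvider | AnthropicProvider
```

Which concrete providers make up the union is an assumption in this sketch; only the `Base*` names and the union itself are taken from the changelog above.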
159 lines · 4.7 KiB · Python
import logging
import os
from pathlib import Path
from typing import Optional, Sequence

from dotenv import load_dotenv

from agbenchmark.challenges import get_unique_categories
from agbenchmark.config import AgentBenchmarkConfig

load_dotenv()

logger = logging.getLogger(__name__)


def run_benchmark(
    config: AgentBenchmarkConfig,
    maintain: bool = False,
    improve: bool = False,
    explore: bool = False,
    tests: tuple[str, ...] = tuple(),
    categories: tuple[str, ...] = tuple(),
    skip_categories: tuple[str, ...] = tuple(),
    attempts_per_challenge: int = 1,
    mock: bool = False,
    no_dep: bool = False,
    no_cutoff: bool = False,
    cutoff: Optional[int] = None,
    keep_answers: bool = False,
    server: bool = False,
) -> int:
    """
    Starts the benchmark. If a category flag is provided, only challenges with the
    corresponding mark will be run.
    """
    import pytest

    from agbenchmark.reports.ReportManager import SingletonReportManager

    validate_args(
        maintain=maintain,
        improve=improve,
        explore=explore,
        tests=tests,
        categories=categories,
        skip_categories=skip_categories,
        no_cutoff=no_cutoff,
        cutoff=cutoff,
    )

    SingletonReportManager()

    for key, value in vars(config).items():
        logger.debug(f"config.{key} = {repr(value)}")

    pytest_args = ["-vs"]

    if tests:
        logger.info(f"Running specific test(s): {' '.join(tests)}")
        pytest_args += [f"--test={t}" for t in tests]
    else:
        all_categories = get_unique_categories()

        if categories or skip_categories:
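            # NOTE: set(categories) is falsy when empty, so an empty --category
            # selection falls back to all categories before --skip-category
            # exclusions are applied below.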
            categories_to_run = set(categories) or all_categories
            if skip_categories:
                categories_to_run = categories_to_run.difference(set(skip_categories))
            assert categories_to_run, "Error: You can't skip all categories"
            pytest_args += [f"--category={c}" for c in categories_to_run]
            logger.info(f"Running tests of category: {categories_to_run}")
        else:
            logger.info("Running all categories")

        if maintain:
            logger.info("Running only regression tests")
        elif improve:
            logger.info("Running only non-regression tests")
        elif explore:
            logger.info("Only attempt challenges that have never been beaten")

    if mock:
        # TODO: unhack
        # Ugly hack to make the mock work when calling from the API:
        os.environ["IS_MOCK"] = "True"

    # Pass through flags
    for flag, active in {
        "--maintain": maintain,
        "--improve": improve,
        "--explore": explore,
        "--no-dep": no_dep,
        "--mock": mock,
        "--nc": no_cutoff,
        "--keep-answers": keep_answers,
    }.items():
        if active:
            pytest_args.append(flag)

    if attempts_per_challenge > 1:
        pytest_args.append(f"--attempts={attempts_per_challenge}")

    if cutoff:
        pytest_args.append(f"--cutoff={cutoff}")
        logger.debug(f"Setting cutoff override to {cutoff} seconds.")
    current_dir = Path(__file__).resolve().parent
    pytest_args.append(str(current_dir / "generate_test.py"))
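
    # --cache-clear wipes pytest's cache at the start of the run so state from
    # previous runs does not leak into this one.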
    pytest_args.append("--cache-clear")
    logger.debug(f"Running Pytest with args: {pytest_args}")
    exit_code = pytest.main(pytest_args)

    SingletonReportManager.clear_instance()
    return exit_code
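

# Hypothetical example of calling run_benchmark from a CLI layer (illustrative
# only; `AgentBenchmarkConfig.load()` as the config loader is an assumption):
#
#     exit_code = run_benchmark(
#         config=AgentBenchmarkConfig.load(),
#         categories=("coding",),
#         mock=True,
#     )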


class InvalidInvocationError(ValueError):
    pass


def validate_args(
    maintain: bool,
    improve: bool,
    explore: bool,
    tests: Sequence[str],
    categories: Sequence[str],
    skip_categories: Sequence[str],
    no_cutoff: bool,
    cutoff: Optional[int],
) -> None:
    if categories:
        all_categories = get_unique_categories()
        invalid_categories = set(categories) - all_categories
        if invalid_categories:
            raise InvalidInvocationError(
                "One or more invalid categories were specified: "
                f"{', '.join(invalid_categories)}.\n"
                f"Valid categories are: {', '.join(all_categories)}."
            )

    if (maintain + improve + explore) > 1:
        raise InvalidInvocationError(
            "You can't use --maintain, --improve or --explore at the same time. "
            "Please choose one."
        )

    if tests and (categories or skip_categories or maintain or improve or explore):
        raise InvalidInvocationError(
            "If you're running a specific test make sure no other options are "
            "selected. Please just pass the --test."
        )

    if no_cutoff and cutoff:
        raise InvalidInvocationError(
            "You can't use both --nc and --cutoff at the same time. "
            "Please choose one."
        )