Mirror of https://github.com/Significant-Gravitas/AutoGPT.git (synced 2026-02-03 11:24:57 -05:00)
**FIX ALL LINT/TYPE ERRORS IN AUTOGPT, FORGE, AND BENCHMARK**

### Linting
- Clean up linter configs for `autogpt`, `forge`, and `benchmark`
- Add type checking with Pyright
- Create unified pre-commit config
- Create unified linting and type checking CI workflow

### Testing
- Synchronize CI test setups for `autogpt`, `forge`, and `benchmark`
  - Add missing pytest-cov to benchmark dependencies
  - Mark GCS tests as slow to speed up pre-commit test runs
- Repair `forge` test suite
  - Add `AgentDB.close()` method for test DB teardown in db_test.py
  - Use an actual temporary dir instead of forge/test_workspace/
- Move left-behind dependencies for moved `forge` code from autogpt to forge

### Notable type changes
- Replace uses of `ChatModelProvider` with `MultiProvider`
- Remove unnecessary exports from various `__init__.py` files
- Simplify `FileStorage.open_file` signature by removing `IOBase` from the return type union
- Implement `S3BinaryIOWrapper(BinaryIO)` type interposer for `S3FileStorage`
- Expand overloads of `GCSFileStorage.open_file` for improved typing of read and write modes
  - Had to silence type checking for the extra overloads, because (I think) Pyright is reporting a false positive: https://github.com/microsoft/pyright/issues/8007
- Change `count_tokens`, `get_tokenizer`, `count_message_tokens` methods on `ModelProvider`s from class methods to instance methods
- Move `CompletionModelFunction.schema` method -> helper function `format_function_def_for_openai` in `forge.llm.providers.openai`
- Rename `ModelProvider` -> `BaseModelProvider`
- Rename `ChatModelProvider` -> `BaseChatModelProvider`
- Add type `ChatModelProvider` which is a union of all subclasses of `BaseChatModelProvider`

### Removed rather than fixed
- Remove deprecated and broken autogpt/agbenchmark_config/benchmarks.py
- Remove various base classes and properties on base classes in `forge.llm.providers.schema` and `forge.models.providers`

### Fixes for other issues that came to light
- Clean up `forge.agent_protocol.api_router`, `forge.agent_protocol.database`, and `forge.agent.agent`
- Add fallback behavior to `ImageGeneratorComponent`
  - Remove test for deprecated failure behavior
- Fix `agbenchmark.challenges.builtin` challenge exclusion mechanism on Windows
- Fix `_tool_calls_compat_extract_calls` in `forge.llm.providers.openai`
- Add support for `any` (= no type specified) in `JSONSchema.typescript_type`
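To illustrate the last bullet under *Notable type changes*: a minimal, self-contained sketch of the "union of concrete subclasses" pattern. The provider classes and `get_provider` helper below are invented placeholders for illustration, not forge's actual providers or API.

```python
from typing import Union


class BaseChatModelProvider:
    """Stand-in for the renamed abstract base class (hypothetical)."""


class OpenAIChatProvider(BaseChatModelProvider):
    """Hypothetical concrete provider."""


class AnthropicChatProvider(BaseChatModelProvider):
    """Hypothetical concrete provider."""


# The alias names a union of concrete subclasses rather than the abstract base,
# so type checkers narrow call sites to real implementations.
ChatModelProvider = Union[OpenAIChatProvider, AnthropicChatProvider]


def get_provider(name: str) -> ChatModelProvider:
    # Simple dispatch; the declared return type is the union, not the base class.
    return OpenAIChatProvider() if name == "openai" else AnthropicChatProvider()
```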
108 lines · 3.3 KiB · Python
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import AsyncIterator, Awaitable, ClassVar, Optional

import pytest
from agent_protocol_client import AgentApi, Step
from colorama import Fore, Style
from pydantic import BaseModel, Field

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult

logger = logging.getLogger(__name__)


class ChallengeInfo(BaseModel):
    eval_id: str = ""
    name: str
    task: str
    task_artifacts_dir: Optional[Path] = None
    category: list[Category]
    difficulty: Optional[DifficultyLevel] = None
    description: Optional[str] = None
    dependencies: list[str] = Field(default_factory=list)
    reference_answer: Optional[str]

    source_uri: str
    """Internal reference indicating the source of the challenge specification"""

    available: bool = True
    unavailable_reason: str = ""


class BaseChallenge(ABC):
    """
    The base class and shared interface for all specific challenge implementations.
    """

    info: ClassVar[ChallengeInfo]

    @classmethod
    @abstractmethod
    def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
        """
        Construct an individual challenge subclass from a suitable `source_uri` (as in
        `ChallengeInfo.source_uri`).
        """
        ...

    @abstractmethod
    def test_method(
        self,
        config: AgentBenchmarkConfig,
        request: pytest.FixtureRequest,
        i_attempt: int,
    ) -> None | Awaitable[None]:
        """
        Test method for use by Pytest-based benchmark sessions. Should return normally
        if the challenge passes, and raise a (preferably descriptive) error otherwise.
        """
        ...

    @classmethod
    async def run_challenge(
        cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
    ) -> AsyncIterator[Step]:
        """
        Runs the challenge on the subject agent with the specified timeout.
        Also prints basic challenge and status info to STDOUT.

        Params:
            config: The subject agent's benchmark config.
            timeout: Timeout (seconds) after which to stop the run if not finished.

        Yields:
            Step: The steps generated by the agent for the challenge task.
        """
        # avoid circular import
        from agbenchmark.agent_api_interface import run_api_agent

        print()
        print(
            f"{Fore.MAGENTA + Style.BRIGHT}{'='*24} "
            f"Starting {cls.info.name} challenge"
            f" {'='*24}{Style.RESET_ALL}"
        )
        print(f"{Fore.CYAN}Timeout:{Fore.RESET} {timeout} seconds")
        print(f"{Fore.CYAN}Task:{Fore.RESET} {cls.info.task}")

        print()
        logger.debug(f"Starting {cls.info.name} challenge run")
        i = 0
        async for step in run_api_agent(
            cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
        ):
            i += 1
            print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
            yield step
        logger.debug(f"Finished {cls.info.name} challenge run")

    @classmethod
    @abstractmethod
    async def evaluate_task_state(
        cls, agent: AgentApi, task_id: str
    ) -> list[EvalResult]:
        ...
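As a usage illustration only (not part of the benchmark source above): a minimal concrete subclass might look roughly like the sketch below. The module path in the imports, the challenge name and task, the `Category.GENERALIST` value, and the `EvalResult` field names are assumptions made for the sketch, not verified details of the current agbenchmark code.

```python
import pytest
from agent_protocol_client import AgentApi

from agbenchmark.challenges.base import BaseChallenge, ChallengeInfo  # assumed path
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, EvalResult


class TrivialChallenge(BaseChallenge):
    # All metadata values are invented for illustration.
    info = ChallengeInfo(
        name="TrivialChallenge",
        task="Write the word 'hello' to a file named hello.txt",
        category=[Category.GENERALIST],  # assumed enum member
        reference_answer="hello",
        source_uri="__illustrative__/trivial/data.json",
    )

    @classmethod
    def from_source_uri(cls, source_uri: str) -> type["BaseChallenge"]:
        # A real implementation would parse the spec behind `source_uri`
        # and build a challenge class from it; here we just return this class.
        return cls

    async def test_method(
        self,
        config: AgentBenchmarkConfig,
        request: pytest.FixtureRequest,
        i_attempt: int,
    ) -> None:
        # Drive the agent through the task, then fail loudly if nothing happened.
        steps = [step async for step in self.run_challenge(config, timeout=60)]
        assert steps, "Agent produced no steps"

    @classmethod
    async def evaluate_task_state(
        cls, agent: AgentApi, task_id: str
    ) -> list[EvalResult]:
        # A real challenge would fetch the task's artifacts via `agent` and
        # score them; the field names here are assumed for illustration.
        return [
            EvalResult(
                result="hello",
                result_source="hello.txt",
                score=1.0,
                passed=True,
            )
        ]
```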