diff --git a/.github/workflows/classic-autogpt-ci.yml b/.github/workflows/classic-autogpt-ci.yml
index 41ea0a9dfd..3a1b5c42bf 100644
--- a/.github/workflows/classic-autogpt-ci.yml
+++ b/.github/workflows/classic-autogpt-ci.yml
@@ -30,40 +30,15 @@ jobs:
     permissions:
       contents: read
     timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.12", "3.13", "3.14"]
-        platform-os: [ubuntu, macos, macos-arm64, windows]
-    runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
+    runs-on: ubuntu-latest

     steps:
-      # Quite slow on macOS (2~4 minutes to set up Docker)
-      # - name: Set up Docker (macOS)
-      #   if: runner.os == 'macOS'
-      #   uses: crazy-max/ghaction-setup-docker@v3
-
-      - name: Start MinIO service (Linux)
-        if: runner.os == 'Linux'
+      - name: Start MinIO service
         working-directory: '.'
         run: |
           docker pull minio/minio:edge-cicd
           docker run -d -p 9000:9000 minio/minio:edge-cicd

-      - name: Start MinIO service (macOS)
-        if: runner.os == 'macOS'
-        working-directory: ${{ runner.temp }}
-        run: |
-          brew install minio/stable/minio
-          mkdir data
-          minio server ./data &
-
-      # No MinIO on Windows:
-      #   - Windows doesn't support running Linux Docker containers
-      #   - It doesn't seem possible to start background processes on Windows. They are
-      #     killed after the step returns.
-      #     See: https://github.com/actions/runner/issues/598#issuecomment-2011890429
-
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
@@ -75,41 +50,23 @@ jobs:
           git config --global user.name "Auto-GPT-Bot"
           git config --global user.email "github-bot@agpt.co"

-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.12"

       - id: get_date
         name: Get date
         run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT

       - name: Set up Python dependency cache
-        # On Windows, unpacking cached dependencies takes longer than just installing them
-        if: runner.os != 'Windows'
         uses: actions/cache@v4
         with:
-          path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
+          path: ~/.cache/pypoetry
           key: poetry-${{ runner.os }}-${{ hashFiles('classic/original_autogpt/poetry.lock') }}

-      - name: Install Poetry (Unix)
-        if: runner.os != 'Windows'
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-
-          if [ "${{ runner.os }}" = "macOS" ]; then
-            PATH="$HOME/.local/bin:$PATH"
-            echo "$HOME/.local/bin" >> $GITHUB_PATH
-          fi
-
-      - name: Install Poetry (Windows)
-        if: runner.os == 'Windows'
-        shell: pwsh
-        run: |
-          (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
-
-          $env:PATH += ";$env:APPDATA\Python\Scripts"
-          echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH
+      - name: Install Poetry
+        run: curl -sSL https://install.python-poetry.org | python3 -

       - name: Install Python dependencies
         run: poetry install
@@ -129,7 +86,7 @@ jobs:
           CI: true
           PLAIN_OUTPUT: True
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          S3_ENDPOINT_URL: ${{ runner.os != 'Windows' && 'http://127.0.0.1:9000' || '' }}
+          S3_ENDPOINT_URL: http://127.0.0.1:9000
           AWS_ACCESS_KEY_ID: minioadmin
           AWS_SECRET_ACCESS_KEY: minioadmin

@@ -143,7 +100,7 @@ jobs:
         uses: codecov/codecov-action@v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          flags: autogpt-agent,${{ runner.os }}
+          flags: autogpt-agent

       - name: Upload logs to artifact
         if: always()
diff --git a/.github/workflows/classic-forge-ci.yml b/.github/workflows/classic-forge-ci.yml
index df1431f844..6ee9ab81ed 100644
--- a/.github/workflows/classic-forge-ci.yml
+++ b/.github/workflows/classic-forge-ci.yml
@@ -26,74 +26,31 @@ jobs:
     permissions:
       contents: read
     timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.12", "3.13", "3.14"]
-        platform-os: [ubuntu, macos, macos-arm64, windows]
-    runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
+    runs-on: ubuntu-latest

     steps:
-      # Quite slow on macOS (2~4 minutes to set up Docker)
-      # - name: Set up Docker (macOS)
-      #   if: runner.os == 'macOS'
-      #   uses: crazy-max/ghaction-setup-docker@v3
-
-      - name: Start MinIO service (Linux)
-        if: runner.os == 'Linux'
+      - name: Start MinIO service
         working-directory: '.'
         run: |
           docker pull minio/minio:edge-cicd
           docker run -d -p 9000:9000 minio/minio:edge-cicd

-      - name: Start MinIO service (macOS)
-        if: runner.os == 'macOS'
-        working-directory: ${{ runner.temp }}
-        run: |
-          brew install minio/stable/minio
-          mkdir data
-          minio server ./data &
-
-      # No MinIO on Windows:
-      #   - Windows doesn't support running Linux Docker containers
-      #   - It doesn't seem possible to start background processes on Windows. They are
-      #     killed after the step returns.
-      #     See: https://github.com/actions/runner/issues/598#issuecomment-2011890429
-
       - name: Checkout repository
         uses: actions/checkout@v4

-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.12"

       - name: Set up Python dependency cache
-        # On Windows, unpacking cached dependencies takes longer than just installing them
-        if: runner.os != 'Windows'
         uses: actions/cache@v4
         with:
-          path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
+          path: ~/.cache/pypoetry
           key: poetry-${{ runner.os }}-${{ hashFiles('classic/forge/poetry.lock') }}

-      - name: Install Poetry (Unix)
-        if: runner.os != 'Windows'
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-
-          if [ "${{ runner.os }}" = "macOS" ]; then
-            PATH="$HOME/.local/bin:$PATH"
-            echo "$HOME/.local/bin" >> $GITHUB_PATH
-          fi
-
-      - name: Install Poetry (Windows)
-        if: runner.os == 'Windows'
-        shell: pwsh
-        run: |
-          (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
-
-          $env:PATH += ";$env:APPDATA\Python\Scripts"
-          echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH
+      - name: Install Poetry
+        run: curl -sSL https://install.python-poetry.org | python3 -

       - name: Install Python dependencies
         run: poetry install
@@ -112,7 +69,7 @@ jobs:
           # Secrets are not available to fork PRs (GitHub security feature)
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-          S3_ENDPOINT_URL: ${{ runner.os != 'Windows' && 'http://127.0.0.1:9000' || '' }}
+          S3_ENDPOINT_URL: http://127.0.0.1:9000
           AWS_ACCESS_KEY_ID: minioadmin
           AWS_SECRET_ACCESS_KEY: minioadmin

@@ -126,11 +83,11 @@ jobs:
         uses: codecov/codecov-action@v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          flags: forge,${{ runner.os }}
+          flags: forge

       - name: Upload logs to artifact
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: test-logs-${{ matrix.platform-os }}-${{ matrix.python-version }}
+          name: test-logs
           path: classic/forge/logs/
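Both workflows above now run only on ubuntu-latest, start MinIO in Docker unconditionally, and point the tests at it through S3_ENDPOINT_URL plus MinIO's default minioadmin credentials. As an illustration only (not code from this repository, and boto3 plus the bucket name are assumptions about the client in use), an S3 client configured from those same environment variables would look roughly like this:

import os

import boto3

# Reads the same variables the workflow steps export for the test run.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],          # http://127.0.0.1:9000
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],    # minioadmin
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)
s3.create_bucket(Bucket="ci-smoke-test")                  # hypothetical bucket name
s3.put_object(Bucket="ci-smoke-test", Key="hello.txt", Body=b"hello from CI")
print(s3.get_object(Bucket="ci-smoke-test", Key="hello.txt")["Body"].read())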
diff --git a/classic/direct_benchmark/challenges/base.py b/classic/direct_benchmark/challenges/base.py
index c3c5cdf7de..ff73706627 100644
--- a/classic/direct_benchmark/challenges/base.py
+++ b/classic/direct_benchmark/challenges/base.py
@@ -4,13 +4,12 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Awaitable, ClassVar, Optional

 import pytest
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
 from agent_protocol_client import AgentApi, Step
 from colorama import Fore, Style
 from pydantic import BaseModel, Field

-from agbenchmark.config import AgentBenchmarkConfig
-from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
-
 logger = logging.getLogger(__name__)


@@ -183,4 +182,5 @@ class BaseChallenge(ABC):
     @abstractmethod
     async def evaluate_task_state(
         cls, agent: AgentApi, task_id: str
-    ) -> list[EvalResult]: ...
+    ) -> list[EvalResult]:
+        ...
diff --git a/classic/direct_benchmark/challenges/builtin.py b/classic/direct_benchmark/challenges/builtin.py
index d1e0418818..feb2ed8aad 100644
--- a/classic/direct_benchmark/challenges/builtin.py
+++ b/classic/direct_benchmark/challenges/builtin.py
@@ -10,6 +10,16 @@ from pathlib import Path
 from typing import Annotated, Any, ClassVar, Iterator, Literal, Optional

 import pytest
+from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
+from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
+from agbenchmark.utils.prompts import (
+    END_PROMPT,
+    FEW_SHOT_EXAMPLES,
+    PROMPT_MAP,
+    SCORING_MAP,
+)
 from agent_protocol_client import AgentApi, ApiClient
 from agent_protocol_client import Configuration as ClientConfig
 from agent_protocol_client import Step
@@ -23,17 +33,6 @@ from pydantic import (
     field_validator,
 )

-from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
-from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
-from agbenchmark.config import AgentBenchmarkConfig
-from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
-from agbenchmark.utils.prompts import (
-    END_PROMPT,
-    FEW_SHOT_EXAMPLES,
-    PROMPT_MAP,
-    SCORING_MAP,
-)
-
 from .base import BaseChallenge, ChallengeInfo

 logger = logging.getLogger(__name__)
@@ -69,9 +68,9 @@ class BuiltinChallengeSpec(BaseModel):
     class Eval(BaseModel):
         type: str
         scoring: Optional[Literal["percentage", "scale", "binary"]] = None
-        template: Optional[Literal["rubric", "reference", "question", "custom"]] = (
-            None
-        )
+        template: Optional[
+            Literal["rubric", "reference", "question", "custom"]
+        ] = None
         examples: Optional[str] = None

         @field_validator("scoring", "template")
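The Eval sub-model shown above constrains scoring and template to fixed Literal sets and runs both through a shared field_validator. A minimal pydantic v2 sketch of that pattern (EvalSpec and check_eval_fields are hypothetical stand-ins, and the pass-through rule is a placeholder, not the repository's actual validation logic):

from typing import Literal, Optional

from pydantic import BaseModel, ValidationError, field_validator


class EvalSpec(BaseModel):
    # Hypothetical stand-in for BuiltinChallengeSpec.Eval
    type: str
    scoring: Optional[Literal["percentage", "scale", "binary"]] = None
    template: Optional[Literal["rubric", "reference", "question", "custom"]] = None

    @field_validator("scoring", "template")
    @classmethod
    def check_eval_fields(cls, value: Optional[str]) -> Optional[str]:
        # Placeholder: the repository's validator applies its own rules;
        # this stub just passes the already Literal-checked value through.
        return value


print(EvalSpec(type="llm", scoring="binary", template="rubric"))

try:
    EvalSpec(type="llm", scoring="exact")  # not one of the allowed Literal values
except ValidationError as exc:
    print(exc)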
diff --git a/classic/direct_benchmark/challenges/webarena.py b/classic/direct_benchmark/challenges/webarena.py
index be8a300be8..e1e8c049cb 100644
--- a/classic/direct_benchmark/challenges/webarena.py
+++ b/classic/direct_benchmark/challenges/webarena.py
@@ -5,11 +5,10 @@ from typing import ClassVar, Iterator, Literal

 import pytest
 import requests
-from agent_protocol_client import AgentApi, Step
-from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator
-
 from agbenchmark.config import AgentBenchmarkConfig
 from agbenchmark.utils.data_types import Category, EvalResult
+from agent_protocol_client import AgentApi, Step
+from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator

 from .base import BaseChallenge, ChallengeInfo

@@ -84,11 +83,13 @@ def resolve_uri(uri: str) -> str:

 class Eval(ABC):
     @abstractmethod
-    def evaluate(self, string: str) -> bool: ...
+    def evaluate(self, string: str) -> bool:
+        ...

     @property
     @abstractmethod
-    def description(self) -> str: ...
+    def description(self) -> str:
+        ...


 class BaseStringEval(BaseModel, Eval):
diff --git a/classic/direct_benchmark/direct_benchmark/harness.py b/classic/direct_benchmark/direct_benchmark/harness.py
index 3e265f4dcc..df94caf439 100644
--- a/classic/direct_benchmark/direct_benchmark/harness.py
+++ b/classic/direct_benchmark/direct_benchmark/harness.py
@@ -88,10 +88,16 @@ class BenchmarkHarness:
             if progress.result:
                 all_results[progress.config_name].append(progress.result)

+        # Create step callback if UI supports it
+        step_callback = None
+        if hasattr(ui, "log_step"):
+            step_callback = ui.log_step
+
         # Create executor
         executor = ParallelExecutor(
             max_parallel=self.config.max_parallel,
             on_progress=on_progress,
+            on_step=step_callback,
             attempts=self.config.attempts,
             no_cutoff=self.config.no_cutoff,
         )
@@ -101,7 +107,8 @@ class BenchmarkHarness:

         # Run with or without live display
         if isinstance(ui, BenchmarkUI) and ui_mode == "default":
-            with Live(ui.render_live_display(), console=console, refresh_per_second=4):
+            # Pass the UI object itself so Live can refresh it
+            with Live(ui, console=console, refresh_per_second=4):
                 async for _ in executor.execute_matrix(
                     self.config.configs,
                     challenges,
diff --git a/classic/direct_benchmark/direct_benchmark/parallel.py b/classic/direct_benchmark/direct_benchmark/parallel.py
index 954508a6e8..9f2dcc0ed9 100644
--- a/classic/direct_benchmark/direct_benchmark/parallel.py
+++ b/classic/direct_benchmark/direct_benchmark/parallel.py
@@ -6,7 +6,7 @@ from typing import AsyncIterator, Callable, Optional

 from .evaluator import Evaluator
 from .models import BenchmarkConfig, Challenge, ChallengeResult, ExecutionProgress
-from .runner import AgentRunner
+from .runner import AgentRunner, StepCallback


 class ParallelExecutor:
@@ -16,11 +16,13 @@ class ParallelExecutor:
         self,
         max_parallel: int = 4,
         on_progress: Optional[Callable[[ExecutionProgress], None]] = None,
+        on_step: Optional[StepCallback] = None,
         attempts: int = 1,
         no_cutoff: bool = False,
     ):
         self.max_parallel = max_parallel
         self.on_progress = on_progress
+        self.on_step = on_step
         self.attempts = attempts
         self.no_cutoff = no_cutoff
         self._semaphore = asyncio.Semaphore(max_parallel)
@@ -86,7 +88,12 @@ class ParallelExecutor:
             )

             # Run the challenge (with modified timeout if no_cutoff is set)
-            runner = AgentRunner(config, workspace_root, no_cutoff=self.no_cutoff)
+            runner = AgentRunner(
+                config,
+                workspace_root,
+                no_cutoff=self.no_cutoff,
+                step_callback=self.on_step,
+            )
             result = await runner.run_challenge(challenge, attempt=attempt)

             # Evaluate result
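ParallelExecutor now simply forwards an optional on_step callback into each AgentRunner, and the harness only wires it up when the UI object actually exposes log_step. A self-contained sketch of that plumbing pattern (ToyRunner, ToyExecutor, and ToyUI are invented names that only mirror the StepCallback signature introduced in the runner.py diff below, not the repository's real classes):

from typing import Callable, Optional

# Same shape as runner.py's StepCallback:
# (config_name, challenge_name, step_num, tool_name, result_preview, is_error)
StepCallback = Callable[[str, str, int, str, str, bool], None]


class ToyRunner:
    """Stand-in for AgentRunner: fires the callback after every step, if one was given."""

    def __init__(self, config_name: str, step_callback: Optional[StepCallback] = None):
        self.config_name = config_name
        self.step_callback = step_callback

    def run(self, challenge_name: str) -> None:
        for step_num, tool_name in enumerate(["read_file", "write_file"], start=1):
            if self.step_callback:
                self.step_callback(
                    self.config_name, challenge_name, step_num, tool_name, "ok", False
                )


class ToyExecutor:
    """Stand-in for ParallelExecutor: holds the callback and passes it through untouched."""

    def __init__(self, on_step: Optional[StepCallback] = None):
        self.on_step = on_step

    def execute(self, config_name: str, challenge_name: str) -> None:
        ToyRunner(config_name, step_callback=self.on_step).run(challenge_name)


class ToyUI:
    def log_step(self, config, challenge, step, tool, preview, is_error) -> None:
        print(f"[{config}] {challenge} step {step}: {tool} -> {preview} (error={is_error})")


ui = ToyUI()
# Duck-typed wiring, as in harness.py: only hook up the callback if the UI supports it.
executor = ToyExecutor(on_step=ui.log_step if hasattr(ui, "log_step") else None)
executor.execute("config-a", "toy-challenge")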
diff --git a/classic/direct_benchmark/direct_benchmark/runner.py b/classic/direct_benchmark/direct_benchmark/runner.py
index 6da689c886..871b5e223d 100644
--- a/classic/direct_benchmark/direct_benchmark/runner.py
+++ b/classic/direct_benchmark/direct_benchmark/runner.py
@@ -5,17 +5,20 @@ import shutil
 import tempfile
 from datetime import datetime
 from pathlib import Path
-from typing import Optional
-
-from forge.file_storage import FileStorageBackendName, get_storage
-from forge.llm.providers import MultiProvider
+from typing import Callable, Optional

 from autogpt.agent_factory.configurators import create_agent
 from autogpt.agents.agent import Agent
 from autogpt.app.config import AppConfig, ConfigBuilder
+from forge.file_storage import FileStorageBackendName, get_storage
+from forge.llm.providers import MultiProvider

 from .models import BenchmarkConfig, Challenge, ChallengeResult, StepResult

+# Type for step logging callback
+StepCallback = Callable[[str, str, int, str, str, bool], None]
+# Args: config_name, challenge_name, step_num, tool_name, result_preview, is_error
+

 class AgentRunner:
     """Runs a single agent instance for a challenge."""
@@ -25,10 +28,12 @@
         config: BenchmarkConfig,
         workspace_root: Path,
         no_cutoff: bool = False,
+        step_callback: Optional[StepCallback] = None,
     ):
         self.config = config
         self.workspace_root = workspace_root
         self.no_cutoff = no_cutoff
+        self.step_callback = step_callback
         self._agent: Optional[Agent] = None
         self._workspace: Optional[Path] = None

@@ -210,20 +215,42 @@
             step_cost = 0.0  # TODO: Extract from LLM provider
             cumulative_cost += step_cost

+            # Get result info
+            result_str = str(
+                result.outputs if hasattr(result, "outputs") else result
+            )
+            is_error = hasattr(result, "status") and result.status == "error"
+
             # Record step
             steps.append(
                 StepResult(
                     step_num=step_num + 1,
                     tool_name=proposal.use_tool.name,
                     tool_args=proposal.use_tool.arguments,
-                    result=str(
-                        result.outputs if hasattr(result, "outputs") else result
-                    ),
-                    is_error=hasattr(result, "status") and result.status == "error",
+                    result=result_str,
+                    is_error=is_error,
                     cumulative_cost=cumulative_cost,
                 )
             )

+            # Call step callback if provided
+            if self.step_callback:
+                # Truncate result for display
+                result_preview = (
+                    result_str[:100] + "..."
+                    if len(result_str) > 100
+                    else result_str
+                )
+                result_preview = result_preview.replace("\n", " ")
+                self.step_callback(
+                    self.config.config_name,
+                    challenge.name,
+                    step_num + 1,
+                    proposal.use_tool.name,
+                    result_preview,
+                    is_error,
+                )
+
         return False  # Hit max steps

     # Run with or without timeout
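harness.py above now hands the BenchmarkUI object itself to rich.live.Live, and the ui.py diff below gives that object a __rich_console__ method, so the live display re-renders from the UI's current state on every refresh instead of showing a single stale render. A minimal standalone sketch of that Rich protocol (the Dashboard class and its fields are invented for illustration, not this repository's code; only the rich library is assumed):

import time

from rich.console import Console, ConsoleOptions, RenderResult
from rich.live import Live
from rich.panel import Panel


class Dashboard:
    """Any object with __rich_console__ is renderable; Live re-renders it on each refresh."""

    def __init__(self) -> None:
        self.ticks = 0

    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
        yield Panel(f"ticks so far: {self.ticks}", title="demo")


dashboard = Dashboard()
with Live(dashboard, refresh_per_second=4):
    for _ in range(8):
        dashboard.ticks += 1  # mutate state; the next auto-refresh picks it up
        time.sleep(0.25)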
diff --git a/classic/direct_benchmark/direct_benchmark/ui.py b/classic/direct_benchmark/direct_benchmark/ui.py
index 2cc09976d2..df4cc97fa5 100644
--- a/classic/direct_benchmark/direct_benchmark/ui.py
+++ b/classic/direct_benchmark/direct_benchmark/ui.py
@@ -3,7 +3,7 @@
 from datetime import datetime
 from typing import Optional

-from rich.console import Console, Group
+from rich.console import Console, Group, RenderableType
 from rich.live import Live
 from rich.panel import Panel
 from rich.progress import (
@@ -22,6 +22,18 @@ from .models import ChallengeResult, ExecutionProgress

 console = Console()

+# Colors for different configs (cycling through for parallel runs)
+CONFIG_COLORS = [
+    "cyan",
+    "green",
+    "yellow",
+    "magenta",
+    "blue",
+    "red",
+    "bright_cyan",
+    "bright_green",
+]
+

 class BenchmarkUI:
     """Rich UI for benchmark progress and results."""
@@ -33,8 +45,11 @@ class BenchmarkUI:

         # Track state
         self.active_runs: dict[str, str] = {}  # config_name -> challenge_name
+        self.active_steps: dict[str, str] = {}  # config_name -> current step info
         self.completed: list[ChallengeResult] = []
         self.results_by_config: dict[str, list[ChallengeResult]] = {}
+        self.recent_steps: list[tuple[str, str, int, str, bool]] = []  # Last N steps
+        self.config_colors: dict[str, str] = {}  # config_name -> color

         # Progress tracking
         self.progress = Progress(
@@ -53,17 +68,56 @@ class BenchmarkUI:
         self.start_time = datetime.now()
         self.total_challenges = total_challenges
         self.results_by_config = {config: [] for config in configs}
+        # Assign colors to configs
+        for i, config in enumerate(configs):
+            self.config_colors[config] = CONFIG_COLORS[i % len(CONFIG_COLORS)]
         self.main_task = self.progress.add_task(
             "[cyan]Running benchmarks...", total=total_challenges
         )

+    def get_config_color(self, config_name: str) -> str:
+        """Get the assigned color for a config."""
+        return self.config_colors.get(config_name, "white")
+
+    def log_step(
+        self,
+        config_name: str,
+        challenge_name: str,
+        step_num: int,
+        tool_name: str,
+        result_preview: str,
+        is_error: bool,
+    ) -> None:
+        """Log a step execution (called from AgentRunner)."""
+        # Update active step info
+        self.active_steps[config_name] = f"step {step_num}: {tool_name}"
+
+        # Add to recent steps (keep last 10)
+        self.recent_steps.append(
+            (config_name, challenge_name, step_num, tool_name, is_error)
+        )
+        if len(self.recent_steps) > 10:
+            self.recent_steps.pop(0)
+
+        # In verbose mode, print immediately
+        if self.verbose:
+            color = self.get_config_color(config_name)
+            status = "[red]ERR[/red]" if is_error else "[green]OK[/green]"
+            console.print(
+                f"[{color}][{config_name}][/{color}] {challenge_name} "
+                f"step {step_num}: {tool_name} {status}"
+            )
+
     def update(self, progress: ExecutionProgress) -> None:
         """Update UI with execution progress."""
         if progress.status == "starting":
             self.active_runs[progress.config_name] = progress.challenge_name
+            self.active_steps[progress.config_name] = "starting..."
         elif progress.status in ("completed", "failed"):
             if progress.config_name in self.active_runs:
                 del self.active_runs[progress.config_name]
+            if progress.config_name in self.active_steps:
+                del self.active_steps[progress.config_name]
             if progress.result:
                 self.completed.append(progress.result)
                 self.results_by_config[progress.config_name].append(progress.result)
@@ -84,17 +138,20 @@ class BenchmarkUI:
     def render_active_runs(self) -> Panel:
         """Render panel showing active runs."""
         if not self.active_runs:
-            content = Text("No active runs", style="dim")
+            content = Text("Waiting for runs to start...", style="dim")
         else:
             lines = []
             for config_name, challenge_name in self.active_runs.items():
+                color = self.get_config_color(config_name)
+                step_info = self.active_steps.get(config_name, "")
                 lines.append(
                     Text.assemble(
                         ("  ", ""),
                         ("\u25cf ", "yellow"),  # Bullet point
-                        (f"{config_name}", "cyan"),
+                        (f"{config_name}", color),
                         (" \u2192 ", "dim"),  # Arrow
                         (challenge_name, "white"),
+                        (f" ({step_info})", "dim") if step_info else ("", ""),
                     )
                 )
             content = Group(*lines)
@@ -164,6 +221,44 @@ class BenchmarkUI:
             border_style="green" if self.completed else "dim",
         )

+    def render_recent_steps(self) -> Panel:
+        """Render panel showing recent step executions."""
+        if not self.recent_steps:
+            content = Text("No steps yet", style="dim")
+        else:
+            lines = []
+            for (
+                config_name,
+                challenge,
+                step_num,
+                tool_name,
+                is_error,
+            ) in self.recent_steps[-5:]:
+                color = self.get_config_color(config_name)
+                status = (
+                    Text("\u2717", style="red")
+                    if is_error
+                    else Text("\u2713", style="green")
+                )
+                lines.append(
+                    Text.assemble(
+                        ("  ", ""),
+                        status,
+                        (" ", ""),
+                        (f"[{config_name}]", color),
+                        (" ", ""),
+                        (f"{challenge} #{step_num}: ", "dim"),
+                        (tool_name, "white"),
+                    )
+                )
+            content = Group(*lines)
+
+        return Panel(
+            content,
+            title="[bold]Recent Steps[/bold]",
+            border_style="dim",
+        )
+
     def render_live_display(self) -> Group:
         """Render the full live display."""
         return Group(
@@ -171,9 +266,15 @@ class BenchmarkUI:
             "",
             self.render_active_runs(),
             "",
+            self.render_recent_steps(),
+            "",
             self.render_recent_completions(),
         )

+    def __rich_console__(self, console: Console, options) -> RenderableType:
+        """Support for Rich Live display - called on each refresh."""
+        yield self.render_live_display()
+
     def print_final_summary(self) -> None:
         """Print final summary after all benchmarks complete."""
         elapsed = (