feat(direct_benchmark): add step-level logging with colored prefixes

- Add step callback to AgentRunner for real-time step logging (see the callback sketch below)
- BenchmarkUI now shows:
  - Active runs with current step info
  - Recent steps panel with colored config prefixes
  - Proper Live display refresh (implements __rich_console__)
- Each config gets a distinct color for easy identification
- Verbose mode prints step logs immediately with config prefix
- Fix Live display not updating (pass UI object, not rendered content)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Nicholas Tindle
Date: 2026-01-19 23:02:20 -06:00
Parent: ab95077e5b
Commit: 0a616d9267
9 changed files with 198 additions and 142 deletions
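
The core of the change is a small callback contract: AgentRunner invokes it once per agent step, and BenchmarkUI.log_step satisfies it. A minimal sketch of that contract, with print_step as a hypothetical consumer that mirrors verbose mode (it is not part of the commit):

```python
from typing import Callable

# Callback shape introduced in runner.py:
# (config_name, challenge_name, step_num, tool_name, result_preview, is_error)
StepCallback = Callable[[str, str, int, str, str, bool], None]


def print_step(
    config_name: str,
    challenge_name: str,
    step_num: int,
    tool_name: str,
    result_preview: str,
    is_error: bool,
) -> None:
    """Hypothetical consumer: one line per step, like verbose mode's output."""
    status = "ERR" if is_error else "OK"
    print(f"[{config_name}] {challenge_name} step {step_num}: {tool_name} {status} | {result_preview}")
```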

View File

@@ -30,40 +30,15 @@ jobs:
permissions:
contents: read
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
python-version: ["3.12", "3.13", "3.14"]
platform-os: [ubuntu, macos, macos-arm64, windows]
runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
runs-on: ubuntu-latest
steps:
# Quite slow on macOS (2~4 minutes to set up Docker)
# - name: Set up Docker (macOS)
# if: runner.os == 'macOS'
# uses: crazy-max/ghaction-setup-docker@v3
- name: Start MinIO service (Linux)
if: runner.os == 'Linux'
- name: Start MinIO service
working-directory: '.'
run: |
docker pull minio/minio:edge-cicd
docker run -d -p 9000:9000 minio/minio:edge-cicd
- name: Start MinIO service (macOS)
if: runner.os == 'macOS'
working-directory: ${{ runner.temp }}
run: |
brew install minio/stable/minio
mkdir data
minio server ./data &
# No MinIO on Windows:
# - Windows doesn't support running Linux Docker containers
# - It doesn't seem possible to start background processes on Windows. They are
# killed after the step returns.
# See: https://github.com/actions/runner/issues/598#issuecomment-2011890429
- name: Checkout repository
uses: actions/checkout@v4
with:
@@ -75,41 +50,23 @@ jobs:
git config --global user.name "Auto-GPT-Bot"
git config --global user.email "github-bot@agpt.co"
- name: Set up Python ${{ matrix.python-version }}
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
python-version: "3.12"
- id: get_date
name: Get date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
- name: Set up Python dependency cache
# On Windows, unpacking cached dependencies takes longer than just installing them
if: runner.os != 'Windows'
uses: actions/cache@v4
with:
path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
path: ~/.cache/pypoetry
key: poetry-${{ runner.os }}-${{ hashFiles('classic/original_autogpt/poetry.lock') }}
- name: Install Poetry (Unix)
if: runner.os != 'Windows'
run: |
curl -sSL https://install.python-poetry.org | python3 -
if [ "${{ runner.os }}" = "macOS" ]; then
PATH="$HOME/.local/bin:$PATH"
echo "$HOME/.local/bin" >> $GITHUB_PATH
fi
- name: Install Poetry (Windows)
if: runner.os == 'Windows'
shell: pwsh
run: |
(Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
$env:PATH += ";$env:APPDATA\Python\Scripts"
echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH
- name: Install Poetry
run: curl -sSL https://install.python-poetry.org | python3 -
- name: Install Python dependencies
run: poetry install
@@ -129,7 +86,7 @@ jobs:
CI: true
PLAIN_OUTPUT: True
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
S3_ENDPOINT_URL: ${{ runner.os != 'Windows' && 'http://127.0.0.1:9000' || '' }}
S3_ENDPOINT_URL: http://127.0.0.1:9000
AWS_ACCESS_KEY_ID: minioadmin
AWS_SECRET_ACCESS_KEY: minioadmin
@@ -143,7 +100,7 @@ jobs:
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
flags: autogpt-agent,${{ runner.os }}
flags: autogpt-agent
- name: Upload logs to artifact
if: always()

View File

@@ -26,74 +26,31 @@ jobs:
permissions:
contents: read
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
python-version: ["3.12", "3.13", "3.14"]
platform-os: [ubuntu, macos, macos-arm64, windows]
runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
runs-on: ubuntu-latest
steps:
# Quite slow on macOS (2~4 minutes to set up Docker)
# - name: Set up Docker (macOS)
# if: runner.os == 'macOS'
# uses: crazy-max/ghaction-setup-docker@v3
- name: Start MinIO service (Linux)
if: runner.os == 'Linux'
- name: Start MinIO service
working-directory: '.'
run: |
docker pull minio/minio:edge-cicd
docker run -d -p 9000:9000 minio/minio:edge-cicd
- name: Start MinIO service (macOS)
if: runner.os == 'macOS'
working-directory: ${{ runner.temp }}
run: |
brew install minio/stable/minio
mkdir data
minio server ./data &
# No MinIO on Windows:
# - Windows doesn't support running Linux Docker containers
# - It doesn't seem possible to start background processes on Windows. They are
# killed after the step returns.
# See: https://github.com/actions/runner/issues/598#issuecomment-2011890429
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
python-version: "3.12"
- name: Set up Python dependency cache
# On Windows, unpacking cached dependencies takes longer than just installing them
if: runner.os != 'Windows'
uses: actions/cache@v4
with:
path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
path: ~/.cache/pypoetry
key: poetry-${{ runner.os }}-${{ hashFiles('classic/forge/poetry.lock') }}
- name: Install Poetry (Unix)
if: runner.os != 'Windows'
run: |
curl -sSL https://install.python-poetry.org | python3 -
if [ "${{ runner.os }}" = "macOS" ]; then
PATH="$HOME/.local/bin:$PATH"
echo "$HOME/.local/bin" >> $GITHUB_PATH
fi
- name: Install Poetry (Windows)
if: runner.os == 'Windows'
shell: pwsh
run: |
(Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
$env:PATH += ";$env:APPDATA\Python\Scripts"
echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH
- name: Install Poetry
run: curl -sSL https://install.python-poetry.org | python3 -
- name: Install Python dependencies
run: poetry install
@@ -112,7 +69,7 @@ jobs:
# Secrets are not available to fork PRs (GitHub security feature)
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
S3_ENDPOINT_URL: ${{ runner.os != 'Windows' && 'http://127.0.0.1:9000' || '' }}
S3_ENDPOINT_URL: http://127.0.0.1:9000
AWS_ACCESS_KEY_ID: minioadmin
AWS_SECRET_ACCESS_KEY: minioadmin
@@ -126,11 +83,11 @@ jobs:
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
flags: forge,${{ runner.os }}
flags: forge
- name: Upload logs to artifact
if: always()
uses: actions/upload-artifact@v4
with:
name: test-logs-${{ matrix.platform-os }}-${{ matrix.python-version }}
name: test-logs
path: classic/forge/logs/

View File

@@ -4,13 +4,12 @@ from pathlib import Path
from typing import Any, AsyncIterator, Awaitable, ClassVar, Optional
import pytest
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agent_protocol_client import AgentApi, Step
from colorama import Fore, Style
from pydantic import BaseModel, Field
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
logger = logging.getLogger(__name__)
@@ -183,4 +182,5 @@ class BaseChallenge(ABC):
@abstractmethod
async def evaluate_task_state(
cls, agent: AgentApi, task_id: str
) -> list[EvalResult]: ...
) -> list[EvalResult]:
...

View File

@@ -10,6 +10,16 @@ from pathlib import Path
from typing import Annotated, Any, ClassVar, Iterator, Literal, Optional
import pytest
from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agbenchmark.utils.prompts import (
END_PROMPT,
FEW_SHOT_EXAMPLES,
PROMPT_MAP,
SCORING_MAP,
)
from agent_protocol_client import AgentApi, ApiClient
from agent_protocol_client import Configuration as ClientConfig
from agent_protocol_client import Step
@@ -23,17 +33,6 @@ from pydantic import (
field_validator,
)
from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agbenchmark.utils.prompts import (
END_PROMPT,
FEW_SHOT_EXAMPLES,
PROMPT_MAP,
SCORING_MAP,
)
from .base import BaseChallenge, ChallengeInfo
logger = logging.getLogger(__name__)
@@ -69,9 +68,9 @@ class BuiltinChallengeSpec(BaseModel):
class Eval(BaseModel):
type: str
scoring: Optional[Literal["percentage", "scale", "binary"]] = None
template: Optional[Literal["rubric", "reference", "question", "custom"]] = (
None
)
template: Optional[
Literal["rubric", "reference", "question", "custom"]
] = None
examples: Optional[str] = None
@field_validator("scoring", "template")

View File

@@ -5,11 +5,10 @@ from typing import ClassVar, Iterator, Literal
import pytest
import requests
from agent_protocol_client import AgentApi, Step
from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, EvalResult
from agent_protocol_client import AgentApi, Step
from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator
from .base import BaseChallenge, ChallengeInfo
@@ -84,11 +83,13 @@ def resolve_uri(uri: str) -> str:
class Eval(ABC):
@abstractmethod
def evaluate(self, string: str) -> bool: ...
def evaluate(self, string: str) -> bool:
...
@property
@abstractmethod
def description(self) -> str: ...
def description(self) -> str:
...
class BaseStringEval(BaseModel, Eval):

View File

@@ -88,10 +88,16 @@ class BenchmarkHarness:
if progress.result:
all_results[progress.config_name].append(progress.result)
# Create step callback if UI supports it
step_callback = None
if hasattr(ui, "log_step"):
step_callback = ui.log_step
# Create executor
executor = ParallelExecutor(
max_parallel=self.config.max_parallel,
on_progress=on_progress,
on_step=step_callback,
attempts=self.config.attempts,
no_cutoff=self.config.no_cutoff,
)
@@ -101,7 +107,8 @@ class BenchmarkHarness:
# Run with or without live display
if isinstance(ui, BenchmarkUI) and ui_mode == "default":
with Live(ui.render_live_display(), console=console, refresh_per_second=4):
# Pass the UI object itself so Live can refresh it
with Live(ui, console=console, refresh_per_second=4):
async for _ in executor.execute_matrix(
self.config.configs,
challenges,

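The hunk above is the actual fix for the stale Live display: rather than handing Live a Group rendered once up front, the harness now hands it the BenchmarkUI object, which implements __rich_console__ and is therefore re-rendered on every refresh tick. A minimal sketch of the pattern, with Ticker as a toy stand-in for the UI (not from the codebase):

```python
import time

from rich.live import Live
from rich.text import Text


class Ticker:
    """Toy stand-in for BenchmarkUI: Rich re-renders it on every Live refresh."""

    def __init__(self) -> None:
        self.count = 0

    def __rich_console__(self, console, options):
        # Called by Rich on each refresh, so changes to self.count show up live.
        yield Text(f"steps logged: {self.count}")


ticker = Ticker()
with Live(ticker, refresh_per_second=4):  # pass the object, not a pre-rendered snapshot
    for _ in range(10):
        ticker.count += 1  # picked up on the next refresh
        time.sleep(0.25)
```
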
View File

@@ -6,7 +6,7 @@ from typing import AsyncIterator, Callable, Optional
from .evaluator import Evaluator
from .models import BenchmarkConfig, Challenge, ChallengeResult, ExecutionProgress
from .runner import AgentRunner
from .runner import AgentRunner, StepCallback
class ParallelExecutor:
@@ -16,11 +16,13 @@ class ParallelExecutor:
self,
max_parallel: int = 4,
on_progress: Optional[Callable[[ExecutionProgress], None]] = None,
on_step: Optional[StepCallback] = None,
attempts: int = 1,
no_cutoff: bool = False,
):
self.max_parallel = max_parallel
self.on_progress = on_progress
self.on_step = on_step
self.attempts = attempts
self.no_cutoff = no_cutoff
self._semaphore = asyncio.Semaphore(max_parallel)
@@ -86,7 +88,12 @@ class ParallelExecutor:
)
# Run the challenge (with modified timeout if no_cutoff is set)
runner = AgentRunner(config, workspace_root, no_cutoff=self.no_cutoff)
runner = AgentRunner(
config,
workspace_root,
no_cutoff=self.no_cutoff,
step_callback=self.on_step,
)
result = await runner.run_challenge(challenge, attempt=attempt)
# Evaluate result

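ParallelExecutor was already capping concurrency with asyncio.Semaphore(max_parallel), as the hunk above shows; the new on_step argument simply rides along into each AgentRunner it spawns. The semaphore pattern in isolation, as a standard-library-only sketch:

```python
import asyncio
from typing import Awaitable, Callable


async def run_bounded(
    jobs: list[Callable[[], Awaitable[None]]], max_parallel: int = 4
) -> None:
    """Run job factories with at most max_parallel in flight — the executor's core pattern."""
    sem = asyncio.Semaphore(max_parallel)

    async def guarded(make_job: Callable[[], Awaitable[None]]) -> None:
        async with sem:  # at most max_parallel challenge runs at once
            await make_job()

    await asyncio.gather(*(guarded(job) for job in jobs))


async def fake_challenge(i: int) -> None:
    await asyncio.sleep(0.1)
    print(f"challenge {i} done")


asyncio.run(run_bounded([lambda i=i: fake_challenge(i) for i in range(10)], max_parallel=4))
```
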
View File

@@ -5,17 +5,20 @@ import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional
from forge.file_storage import FileStorageBackendName, get_storage
from forge.llm.providers import MultiProvider
from typing import Callable, Optional
from autogpt.agent_factory.configurators import create_agent
from autogpt.agents.agent import Agent
from autogpt.app.config import AppConfig, ConfigBuilder
from forge.file_storage import FileStorageBackendName, get_storage
from forge.llm.providers import MultiProvider
from .models import BenchmarkConfig, Challenge, ChallengeResult, StepResult
# Type for step logging callback
StepCallback = Callable[[str, str, int, str, str, bool], None]
# Args: config_name, challenge_name, step_num, tool_name, result_preview, is_error
class AgentRunner:
"""Runs a single agent instance for a challenge."""
@@ -25,10 +28,12 @@ class AgentRunner:
config: BenchmarkConfig,
workspace_root: Path,
no_cutoff: bool = False,
step_callback: Optional[StepCallback] = None,
):
self.config = config
self.workspace_root = workspace_root
self.no_cutoff = no_cutoff
self.step_callback = step_callback
self._agent: Optional[Agent] = None
self._workspace: Optional[Path] = None
@@ -210,20 +215,42 @@ class AgentRunner:
step_cost = 0.0 # TODO: Extract from LLM provider
cumulative_cost += step_cost
# Get result info
result_str = str(
result.outputs if hasattr(result, "outputs") else result
)
is_error = hasattr(result, "status") and result.status == "error"
# Record step
steps.append(
StepResult(
step_num=step_num + 1,
tool_name=proposal.use_tool.name,
tool_args=proposal.use_tool.arguments,
result=str(
result.outputs if hasattr(result, "outputs") else result
),
is_error=hasattr(result, "status") and result.status == "error",
result=result_str,
is_error=is_error,
cumulative_cost=cumulative_cost,
)
)
# Call step callback if provided
if self.step_callback:
# Truncate result for display
result_preview = (
result_str[:100] + "..."
if len(result_str) > 100
else result_str
)
result_preview = result_preview.replace("\n", " ")
self.step_callback(
self.config.config_name,
challenge.name,
step_num + 1,
proposal.use_tool.name,
result_preview,
is_error,
)
return False # Hit max steps
# Run with or without timeout

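Before invoking the callback, the runner clamps the tool result to a 100-character, newline-free preview so each step stays on one log line. The same transformation as a standalone helper (preview is illustrative, not a function added by the commit):

```python
def preview(result: str, limit: int = 100) -> str:
    """Truncate and flatten a tool result for single-line step logs."""
    text = result[:limit] + "..." if len(result) > limit else result
    return text.replace("\n", " ")


assert preview("ok") == "ok"
assert len(preview("x" * 250)) == 103  # 100 chars + "..."
```
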
View File

@@ -3,7 +3,7 @@
from datetime import datetime
from typing import Optional
from rich.console import Console, Group
from rich.console import Console, Group, RenderableType
from rich.live import Live
from rich.panel import Panel
from rich.progress import (
@@ -22,6 +22,18 @@ from .models import ChallengeResult, ExecutionProgress
console = Console()
# Colors for different configs (cycling through for parallel runs)
CONFIG_COLORS = [
"cyan",
"green",
"yellow",
"magenta",
"blue",
"red",
"bright_cyan",
"bright_green",
]
class BenchmarkUI:
"""Rich UI for benchmark progress and results."""
@@ -33,8 +45,11 @@ class BenchmarkUI:
# Track state
self.active_runs: dict[str, str] = {} # config_name -> challenge_name
self.active_steps: dict[str, str] = {} # config_name -> current step info
self.completed: list[ChallengeResult] = []
self.results_by_config: dict[str, list[ChallengeResult]] = {}
self.recent_steps: list[tuple[str, str, int, str, bool]] = [] # Last N steps
self.config_colors: dict[str, str] = {} # config_name -> color
# Progress tracking
self.progress = Progress(
@@ -53,17 +68,56 @@ class BenchmarkUI:
self.start_time = datetime.now()
self.total_challenges = total_challenges
self.results_by_config = {config: [] for config in configs}
# Assign colors to configs
for i, config in enumerate(configs):
self.config_colors[config] = CONFIG_COLORS[i % len(CONFIG_COLORS)]
self.main_task = self.progress.add_task(
"[cyan]Running benchmarks...", total=total_challenges
)
def get_config_color(self, config_name: str) -> str:
"""Get the assigned color for a config."""
return self.config_colors.get(config_name, "white")
def log_step(
self,
config_name: str,
challenge_name: str,
step_num: int,
tool_name: str,
result_preview: str,
is_error: bool,
) -> None:
"""Log a step execution (called from AgentRunner)."""
# Update active step info
self.active_steps[config_name] = f"step {step_num}: {tool_name}"
# Add to recent steps (keep last 10)
self.recent_steps.append(
(config_name, challenge_name, step_num, tool_name, is_error)
)
if len(self.recent_steps) > 10:
self.recent_steps.pop(0)
# In verbose mode, print immediately
if self.verbose:
color = self.get_config_color(config_name)
status = "[red]ERR[/red]" if is_error else "[green]OK[/green]"
console.print(
f"[{color}][{config_name}][/{color}] {challenge_name} "
f"step {step_num}: {tool_name} {status}"
)
def update(self, progress: ExecutionProgress) -> None:
"""Update UI with execution progress."""
if progress.status == "starting":
self.active_runs[progress.config_name] = progress.challenge_name
self.active_steps[progress.config_name] = "starting..."
elif progress.status in ("completed", "failed"):
if progress.config_name in self.active_runs:
del self.active_runs[progress.config_name]
if progress.config_name in self.active_steps:
del self.active_steps[progress.config_name]
if progress.result:
self.completed.append(progress.result)
self.results_by_config[progress.config_name].append(progress.result)
@@ -84,17 +138,20 @@ class BenchmarkUI:
def render_active_runs(self) -> Panel:
"""Render panel showing active runs."""
if not self.active_runs:
content = Text("No active runs", style="dim")
content = Text("Waiting for runs to start...", style="dim")
else:
lines = []
for config_name, challenge_name in self.active_runs.items():
color = self.get_config_color(config_name)
step_info = self.active_steps.get(config_name, "")
lines.append(
Text.assemble(
(" ", ""),
("\u25cf ", "yellow"), # Bullet point
(f"{config_name}", "cyan"),
(f"{config_name}", color),
(" \u2192 ", "dim"), # Arrow
(challenge_name, "white"),
(f" ({step_info})", "dim") if step_info else ("", ""),
)
)
content = Group(*lines)
@@ -164,6 +221,44 @@ class BenchmarkUI:
border_style="green" if self.completed else "dim",
)
def render_recent_steps(self) -> Panel:
"""Render panel showing recent step executions."""
if not self.recent_steps:
content = Text("No steps yet", style="dim")
else:
lines = []
for (
config_name,
challenge,
step_num,
tool_name,
is_error,
) in self.recent_steps[-5:]:
color = self.get_config_color(config_name)
status = (
Text("\u2717", style="red")
if is_error
else Text("\u2713", style="green")
)
lines.append(
Text.assemble(
(" ", ""),
status,
(" ", ""),
(f"[{config_name}]", color),
(" ", ""),
(f"{challenge} #{step_num}: ", "dim"),
(tool_name, "white"),
)
)
content = Group(*lines)
return Panel(
content,
title="[bold]Recent Steps[/bold]",
border_style="dim",
)
def render_live_display(self) -> Group:
"""Render the full live display."""
return Group(
@@ -171,9 +266,15 @@ class BenchmarkUI:
"",
self.render_active_runs(),
"",
self.render_recent_steps(),
"",
self.render_recent_completions(),
)
def __rich_console__(self, console: Console, options) -> RenderableType:
"""Support for Rich Live display - called on each refresh."""
yield self.render_live_display()
def print_final_summary(self) -> None:
"""Print final summary after all benchmarks complete."""
elapsed = (