feat(direct_benchmark): add step-level logging with colored prefixes
- Add step callback to AgentRunner for real-time step logging
- BenchmarkUI now shows:
  - Active runs with current step info
  - Recent steps panel with colored config prefixes
  - Proper Live display refresh (implements __rich_console__)
- Each config gets a distinct color for easy identification
- Verbose mode prints step logs immediately with config prefix
- Fix Live display not updating (pass UI object, not rendered content)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
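For orientation before the hunks: the callback contract this commit threads from AgentRunner through ParallelExecutor into BenchmarkUI.log_step can be summarized with the sketch below. The type alias and argument order are taken from the runner.py hunk; print_step is only an illustrative stand-in for a consumer such as log_step, not code from the commit.

from typing import Callable

# Alias added in runner.py; arguments are:
# config_name, challenge_name, step_num, tool_name, result_preview, is_error
StepCallback = Callable[[str, str, int, str, str, bool], None]


def print_step(
    config_name: str,
    challenge_name: str,
    step_num: int,
    tool_name: str,
    result_preview: str,
    is_error: bool,
) -> None:
    """Illustrative consumer of StepCallback (stand-in for BenchmarkUI.log_step)."""
    status = "ERR" if is_error else "OK"
    print(f"[{config_name}] {challenge_name} step {step_num}: {tool_name} {status} | {result_preview}")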
@@ -4,13 +4,12 @@ from pathlib import Path
from typing import Any, AsyncIterator, Awaitable, ClassVar, Optional

import pytest
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agent_protocol_client import AgentApi, Step
from colorama import Fore, Style
from pydantic import BaseModel, Field

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult

logger = logging.getLogger(__name__)


@@ -183,4 +182,5 @@ class BaseChallenge(ABC):
    @abstractmethod
    async def evaluate_task_state(
        cls, agent: AgentApi, task_id: str
    ) -> list[EvalResult]: ...
    ) -> list[EvalResult]:
        ...

@@ -10,6 +10,16 @@ from pathlib import Path
from typing import Annotated, Any, ClassVar, Iterator, Literal, Optional

import pytest
from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agbenchmark.utils.prompts import (
    END_PROMPT,
    FEW_SHOT_EXAMPLES,
    PROMPT_MAP,
    SCORING_MAP,
)
from agent_protocol_client import AgentApi, ApiClient
from agent_protocol_client import Configuration as ClientConfig
from agent_protocol_client import Step

@@ -23,17 +33,6 @@ from pydantic import (
    field_validator,
)

from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agbenchmark.utils.prompts import (
    END_PROMPT,
    FEW_SHOT_EXAMPLES,
    PROMPT_MAP,
    SCORING_MAP,
)

from .base import BaseChallenge, ChallengeInfo

logger = logging.getLogger(__name__)

@@ -69,9 +68,9 @@ class BuiltinChallengeSpec(BaseModel):
    class Eval(BaseModel):
        type: str
        scoring: Optional[Literal["percentage", "scale", "binary"]] = None
        template: Optional[Literal["rubric", "reference", "question", "custom"]] = (
            None
        )
        template: Optional[
            Literal["rubric", "reference", "question", "custom"]
        ] = None
        examples: Optional[str] = None

        @field_validator("scoring", "template")

@@ -5,11 +5,10 @@ from typing import ClassVar, Iterator, Literal

import pytest
import requests
from agent_protocol_client import AgentApi, Step
from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, EvalResult
from agent_protocol_client import AgentApi, Step
from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator

from .base import BaseChallenge, ChallengeInfo

@@ -84,11 +83,13 @@ def resolve_uri(uri: str) -> str:

class Eval(ABC):
    @abstractmethod
    def evaluate(self, string: str) -> bool: ...
    def evaluate(self, string: str) -> bool:
        ...

    @property
    @abstractmethod
    def description(self) -> str: ...
    def description(self) -> str:
        ...


class BaseStringEval(BaseModel, Eval):

@@ -88,10 +88,16 @@ class BenchmarkHarness:
            if progress.result:
                all_results[progress.config_name].append(progress.result)

        # Create step callback if UI supports it
        step_callback = None
        if hasattr(ui, "log_step"):
            step_callback = ui.log_step

        # Create executor
        executor = ParallelExecutor(
            max_parallel=self.config.max_parallel,
            on_progress=on_progress,
            on_step=step_callback,
            attempts=self.config.attempts,
            no_cutoff=self.config.no_cutoff,
        )

@@ -101,7 +107,8 @@ class BenchmarkHarness:

        # Run with or without live display
        if isinstance(ui, BenchmarkUI) and ui_mode == "default":
            with Live(ui.render_live_display(), console=console, refresh_per_second=4):
            # Pass the UI object itself so Live can refresh it
            with Live(ui, console=console, refresh_per_second=4):
                async for _ in executor.execute_matrix(
                    self.config.configs,
                    challenges,
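
The second hunk above is the fix named in the commit message: rich.live.Live re-renders the renderable it was given on every refresh, so a Group built once by ui.render_live_display() is a frozen snapshot, while passing the BenchmarkUI object (which now implements __rich_console__, see the ui.py hunks further down) makes each refresh rebuild the panels from current state. A minimal, self-contained illustration of that behaviour, not harness code; the Ticker class is invented for the example:

import time

from rich.console import Console
from rich.live import Live
from rich.text import Text


class Ticker:
    """Re-rendered on every Live refresh because it implements __rich_console__."""

    def __init__(self) -> None:
        self.count = 0

    def __rich_console__(self, console, options):
        yield Text(f"steps logged: {self.count}")


console = Console()
ticker = Ticker()
with Live(ticker, console=console, refresh_per_second=4):  # pass the object, not a pre-rendered snapshot
    for _ in range(8):
        ticker.count += 1  # mutated state shows up on the next refresh
        time.sleep(0.25)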
@@ -6,7 +6,7 @@ from typing import AsyncIterator, Callable, Optional

from .evaluator import Evaluator
from .models import BenchmarkConfig, Challenge, ChallengeResult, ExecutionProgress
from .runner import AgentRunner
from .runner import AgentRunner, StepCallback


class ParallelExecutor:

@@ -16,11 +16,13 @@ class ParallelExecutor:
        self,
        max_parallel: int = 4,
        on_progress: Optional[Callable[[ExecutionProgress], None]] = None,
        on_step: Optional[StepCallback] = None,
        attempts: int = 1,
        no_cutoff: bool = False,
    ):
        self.max_parallel = max_parallel
        self.on_progress = on_progress
        self.on_step = on_step
        self.attempts = attempts
        self.no_cutoff = no_cutoff
        self._semaphore = asyncio.Semaphore(max_parallel)

@@ -86,7 +88,12 @@ class ParallelExecutor:
            )

            # Run the challenge (with modified timeout if no_cutoff is set)
            runner = AgentRunner(config, workspace_root, no_cutoff=self.no_cutoff)
            runner = AgentRunner(
                config,
                workspace_root,
                no_cutoff=self.no_cutoff,
                step_callback=self.on_step,
            )
            result = await runner.run_challenge(challenge, attempt=attempt)

            # Evaluate result

@@ -5,17 +5,20 @@ import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional

from forge.file_storage import FileStorageBackendName, get_storage
from forge.llm.providers import MultiProvider
from typing import Callable, Optional

from autogpt.agent_factory.configurators import create_agent
from autogpt.agents.agent import Agent
from autogpt.app.config import AppConfig, ConfigBuilder
from forge.file_storage import FileStorageBackendName, get_storage
from forge.llm.providers import MultiProvider

from .models import BenchmarkConfig, Challenge, ChallengeResult, StepResult

# Type for step logging callback
StepCallback = Callable[[str, str, int, str, str, bool], None]
# Args: config_name, challenge_name, step_num, tool_name, result_preview, is_error


class AgentRunner:
    """Runs a single agent instance for a challenge."""

@@ -25,10 +28,12 @@ class AgentRunner:
        config: BenchmarkConfig,
        workspace_root: Path,
        no_cutoff: bool = False,
        step_callback: Optional[StepCallback] = None,
    ):
        self.config = config
        self.workspace_root = workspace_root
        self.no_cutoff = no_cutoff
        self.step_callback = step_callback
        self._agent: Optional[Agent] = None
        self._workspace: Optional[Path] = None
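
Given the constructor above, any callable matching StepCallback can be injected. A hedged usage sketch follows; config and workspace_root stand in for objects the executor already holds, and echo_step is invented for the example:

def echo_step(config_name, challenge_name, step_num, tool_name, preview, is_error):
    suffix = " (error)" if is_error else ""
    print(f"[{config_name}] {challenge_name} #{step_num} {tool_name}: {preview}{suffix}")


runner = AgentRunner(
    config,          # an existing BenchmarkConfig
    workspace_root,  # an existing pathlib.Path
    no_cutoff=False,
    step_callback=echo_step,  # omit (None) to keep the previous, silent behaviour
)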
@@ -210,20 +215,42 @@ class AgentRunner:
            step_cost = 0.0  # TODO: Extract from LLM provider
            cumulative_cost += step_cost

            # Get result info
            result_str = str(
                result.outputs if hasattr(result, "outputs") else result
            )
            is_error = hasattr(result, "status") and result.status == "error"

            # Record step
            steps.append(
                StepResult(
                    step_num=step_num + 1,
                    tool_name=proposal.use_tool.name,
                    tool_args=proposal.use_tool.arguments,
                    result=str(
                        result.outputs if hasattr(result, "outputs") else result
                    ),
                    is_error=hasattr(result, "status") and result.status == "error",
                    result=result_str,
                    is_error=is_error,
                    cumulative_cost=cumulative_cost,
                )
            )

            # Call step callback if provided
            if self.step_callback:
                # Truncate result for display
                result_preview = (
                    result_str[:100] + "..."
                    if len(result_str) > 100
                    else result_str
                )
                result_preview = result_preview.replace("\n", " ")
                self.step_callback(
                    self.config.config_name,
                    challenge.name,
                    step_num + 1,
                    proposal.use_tool.name,
                    result_preview,
                    is_error,
                )

        return False  # Hit max steps

        # Run with or without timeout
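
The preview handed to the callback in the hunk above is just the stringified step result, truncated and flattened to one line. The same logic in isolation (the helper name is illustrative, not part of the commit):

def make_preview(result_str: str, limit: int = 100) -> str:
    preview = result_str[:limit] + "..." if len(result_str) > limit else result_str
    return preview.replace("\n", " ")  # keep the UI line single-line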
@@ -3,7 +3,7 @@
from datetime import datetime
from typing import Optional

from rich.console import Console, Group
from rich.console import Console, Group, RenderableType
from rich.live import Live
from rich.panel import Panel
from rich.progress import (

@@ -22,6 +22,18 @@ from .models import ChallengeResult, ExecutionProgress

console = Console()

# Colors for different configs (cycling through for parallel runs)
CONFIG_COLORS = [
    "cyan",
    "green",
    "yellow",
    "magenta",
    "blue",
    "red",
    "bright_cyan",
    "bright_green",
]


class BenchmarkUI:
    """Rich UI for benchmark progress and results."""

@@ -33,8 +45,11 @@ class BenchmarkUI:

        # Track state
        self.active_runs: dict[str, str] = {}  # config_name -> challenge_name
        self.active_steps: dict[str, str] = {}  # config_name -> current step info
        self.completed: list[ChallengeResult] = []
        self.results_by_config: dict[str, list[ChallengeResult]] = {}
        self.recent_steps: list[tuple[str, str, int, str, bool]] = []  # Last N steps
        self.config_colors: dict[str, str] = {}  # config_name -> color

        # Progress tracking
        self.progress = Progress(

@@ -53,17 +68,56 @@ class BenchmarkUI:
        self.start_time = datetime.now()
        self.total_challenges = total_challenges
        self.results_by_config = {config: [] for config in configs}
        # Assign colors to configs
        for i, config in enumerate(configs):
            self.config_colors[config] = CONFIG_COLORS[i % len(CONFIG_COLORS)]
        self.main_task = self.progress.add_task(
            "[cyan]Running benchmarks...", total=total_challenges
        )

    def get_config_color(self, config_name: str) -> str:
        """Get the assigned color for a config."""
        return self.config_colors.get(config_name, "white")

    def log_step(
        self,
        config_name: str,
        challenge_name: str,
        step_num: int,
        tool_name: str,
        result_preview: str,
        is_error: bool,
    ) -> None:
        """Log a step execution (called from AgentRunner)."""
        # Update active step info
        self.active_steps[config_name] = f"step {step_num}: {tool_name}"

        # Add to recent steps (keep last 10)
        self.recent_steps.append(
            (config_name, challenge_name, step_num, tool_name, is_error)
        )
        if len(self.recent_steps) > 10:
            self.recent_steps.pop(0)

        # In verbose mode, print immediately
        if self.verbose:
            color = self.get_config_color(config_name)
            status = "[red]ERR[/red]" if is_error else "[green]OK[/green]"
            console.print(
                f"[{color}][{config_name}][/{color}] {challenge_name} "
                f"step {step_num}: {tool_name} {status}"
            )

    def update(self, progress: ExecutionProgress) -> None:
        """Update UI with execution progress."""
        if progress.status == "starting":
            self.active_runs[progress.config_name] = progress.challenge_name
            self.active_steps[progress.config_name] = "starting..."
        elif progress.status in ("completed", "failed"):
            if progress.config_name in self.active_runs:
                del self.active_runs[progress.config_name]
            if progress.config_name in self.active_steps:
                del self.active_steps[progress.config_name]
            if progress.result:
                self.completed.append(progress.result)
                self.results_by_config[progress.config_name].append(progress.result)
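
log_step above keeps the last 10 steps with an append followed by pop(0). An equivalent bounded buffer, not what the diff uses but a common alternative, is collections.deque with maxlen; the sample tuple values here are illustrative only:

from collections import deque

# Equivalent to "append, then pop(0) when len > 10": old entries fall off automatically.
recent_steps: deque[tuple[str, str, int, str, bool]] = deque(maxlen=10)
recent_steps.append(("gpt-4o", "ReadFile", 1, "read_file", False))  # placeholder data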
@@ -84,17 +138,20 @@ class BenchmarkUI:
    def render_active_runs(self) -> Panel:
        """Render panel showing active runs."""
        if not self.active_runs:
            content = Text("No active runs", style="dim")
            content = Text("Waiting for runs to start...", style="dim")
        else:
            lines = []
            for config_name, challenge_name in self.active_runs.items():
                color = self.get_config_color(config_name)
                step_info = self.active_steps.get(config_name, "")
                lines.append(
                    Text.assemble(
                        (" ", ""),
                        ("\u25cf ", "yellow"),  # Bullet point
                        (f"{config_name}", "cyan"),
                        (f"{config_name}", color),
                        (" \u2192 ", "dim"),  # Arrow
                        (challenge_name, "white"),
                        (f" ({step_info})", "dim") if step_info else ("", ""),
                    )
                )
            content = Group(*lines)

@@ -164,6 +221,44 @@ class BenchmarkUI:
            border_style="green" if self.completed else "dim",
        )

    def render_recent_steps(self) -> Panel:
        """Render panel showing recent step executions."""
        if not self.recent_steps:
            content = Text("No steps yet", style="dim")
        else:
            lines = []
            for (
                config_name,
                challenge,
                step_num,
                tool_name,
                is_error,
            ) in self.recent_steps[-5:]:
                color = self.get_config_color(config_name)
                status = (
                    Text("\u2717", style="red")
                    if is_error
                    else Text("\u2713", style="green")
                )
                lines.append(
                    Text.assemble(
                        (" ", ""),
                        status,
                        (" ", ""),
                        (f"[{config_name}]", color),
                        (" ", ""),
                        (f"{challenge} #{step_num}: ", "dim"),
                        (tool_name, "white"),
                    )
                )
            content = Group(*lines)

        return Panel(
            content,
            title="[bold]Recent Steps[/bold]",
            border_style="dim",
        )

    def render_live_display(self) -> Group:
        """Render the full live display."""
        return Group(

@@ -171,9 +266,15 @@ class BenchmarkUI:
            "",
            self.render_active_runs(),
            "",
            self.render_recent_steps(),
            "",
            self.render_recent_completions(),
        )

    def __rich_console__(self, console: Console, options) -> RenderableType:
        """Support for Rich Live display - called on each refresh."""
        yield self.render_live_display()

    def print_final_summary(self) -> None:
        """Print final summary after all benchmarks complete."""
        elapsed = (