feat(direct_benchmark): add step-level logging with colored prefixes
- Add step callback to AgentRunner for real-time step logging
- BenchmarkUI now shows:
  - Active runs with current step info
  - Recent steps panel with colored config prefixes
  - Proper Live display refresh (implements __rich_console__)
- Each config gets a distinct color for easy identification
- Verbose mode prints step logs immediately with config prefix
- Fix Live display not updating (pass UI object, not rendered content)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
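For orientation before the hunks: the callback contract this commit threads from AgentRunner through ParallelExecutor into BenchmarkUI.log_step can be summarized with the sketch below. The type alias and argument order are taken from the runner.py hunk; print_step is only an illustrative stand-in for a consumer such as log_step, not code from the commit.

from typing import Callable

# Alias added in runner.py; arguments are:
# config_name, challenge_name, step_num, tool_name, result_preview, is_error
StepCallback = Callable[[str, str, int, str, str, bool], None]


def print_step(
    config_name: str,
    challenge_name: str,
    step_num: int,
    tool_name: str,
    result_preview: str,
    is_error: bool,
) -> None:
    """Illustrative consumer of StepCallback (stand-in for BenchmarkUI.log_step)."""
    status = "ERR" if is_error else "OK"
    print(f"[{config_name}] {challenge_name} step {step_num}: {tool_name} {status} | {result_preview}")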
@@ -4,13 +4,12 @@ from pathlib import Path
from typing import Any, AsyncIterator, Awaitable, ClassVar, Optional

import pytest
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agent_protocol_client import AgentApi, Step
from colorama import Fore, Style
from pydantic import BaseModel, Field

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult

logger = logging.getLogger(__name__)


@@ -183,4 +182,5 @@ class BaseChallenge(ABC):
    @abstractmethod
    async def evaluate_task_state(
        cls, agent: AgentApi, task_id: str
    ) -> list[EvalResult]: ...
    ) -> list[EvalResult]:
        ...

@@ -10,6 +10,16 @@ from pathlib import Path
from typing import Annotated, Any, ClassVar, Iterator, Literal, Optional

import pytest
from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agbenchmark.utils.prompts import (
    END_PROMPT,
    FEW_SHOT_EXAMPLES,
    PROMPT_MAP,
    SCORING_MAP,
)
from agent_protocol_client import AgentApi, ApiClient
from agent_protocol_client import Configuration as ClientConfig
from agent_protocol_client import Step

@@ -23,17 +33,6 @@ from pydantic import (
    field_validator,
)

from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
from agbenchmark.utils.prompts import (
    END_PROMPT,
    FEW_SHOT_EXAMPLES,
    PROMPT_MAP,
    SCORING_MAP,
)

from .base import BaseChallenge, ChallengeInfo

logger = logging.getLogger(__name__)

@@ -69,9 +68,9 @@ class BuiltinChallengeSpec(BaseModel):
    class Eval(BaseModel):
        type: str
        scoring: Optional[Literal["percentage", "scale", "binary"]] = None
        template: Optional[Literal["rubric", "reference", "question", "custom"]] = (
            None
        )
        template: Optional[
            Literal["rubric", "reference", "question", "custom"]
        ] = None
        examples: Optional[str] = None

        @field_validator("scoring", "template")

@@ -5,11 +5,10 @@ from typing import ClassVar, Iterator, Literal

import pytest
import requests
from agent_protocol_client import AgentApi, Step
from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.utils.data_types import Category, EvalResult
from agent_protocol_client import AgentApi, Step
from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator

from .base import BaseChallenge, ChallengeInfo

@@ -84,11 +83,13 @@ def resolve_uri(uri: str) -> str:

class Eval(ABC):
    @abstractmethod
    def evaluate(self, string: str) -> bool: ...
    def evaluate(self, string: str) -> bool:
        ...

    @property
    @abstractmethod
    def description(self) -> str: ...
    def description(self) -> str:
        ...


class BaseStringEval(BaseModel, Eval):

@@ -88,10 +88,16 @@ class BenchmarkHarness:
            if progress.result:
                all_results[progress.config_name].append(progress.result)

        # Create step callback if UI supports it
        step_callback = None
        if hasattr(ui, "log_step"):
            step_callback = ui.log_step

        # Create executor
        executor = ParallelExecutor(
            max_parallel=self.config.max_parallel,
            on_progress=on_progress,
            on_step=step_callback,
            attempts=self.config.attempts,
            no_cutoff=self.config.no_cutoff,
        )

@@ -101,7 +107,8 @@ class BenchmarkHarness:

        # Run with or without live display
        if isinstance(ui, BenchmarkUI) and ui_mode == "default":
            with Live(ui.render_live_display(), console=console, refresh_per_second=4):
            # Pass the UI object itself so Live can refresh it
            with Live(ui, console=console, refresh_per_second=4):
                async for _ in executor.execute_matrix(
                    self.config.configs,
                    challenges,
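
The second hunk above is the fix named in the commit message: rich.live.Live re-renders the renderable it was given on every refresh, so a Group built once by ui.render_live_display() is a frozen snapshot, while passing the BenchmarkUI object (which now implements __rich_console__, see the ui.py hunks further down) makes each refresh rebuild the panels from current state. A minimal, self-contained illustration of that behaviour, not harness code; the Ticker class is invented for the example:

import time

from rich.console import Console
from rich.live import Live
from rich.text import Text


class Ticker:
    """Re-rendered on every Live refresh because it implements __rich_console__."""

    def __init__(self) -> None:
        self.count = 0

    def __rich_console__(self, console, options):
        yield Text(f"steps logged: {self.count}")


console = Console()
ticker = Ticker()
with Live(ticker, console=console, refresh_per_second=4):  # pass the object, not a pre-rendered snapshot
    for _ in range(8):
        ticker.count += 1  # mutated state shows up on the next refresh
        time.sleep(0.25)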
@@ -6,7 +6,7 @@ from typing import AsyncIterator, Callable, Optional

from .evaluator import Evaluator
from .models import BenchmarkConfig, Challenge, ChallengeResult, ExecutionProgress
from .runner import AgentRunner
from .runner import AgentRunner, StepCallback


class ParallelExecutor:

@@ -16,11 +16,13 @@ class ParallelExecutor:
        self,
        max_parallel: int = 4,
        on_progress: Optional[Callable[[ExecutionProgress], None]] = None,
        on_step: Optional[StepCallback] = None,
        attempts: int = 1,
        no_cutoff: bool = False,
    ):
        self.max_parallel = max_parallel
        self.on_progress = on_progress
        self.on_step = on_step
        self.attempts = attempts
        self.no_cutoff = no_cutoff
        self._semaphore = asyncio.Semaphore(max_parallel)

@@ -86,7 +88,12 @@ class ParallelExecutor:
            )

            # Run the challenge (with modified timeout if no_cutoff is set)
            runner = AgentRunner(config, workspace_root, no_cutoff=self.no_cutoff)
            runner = AgentRunner(
                config,
                workspace_root,
                no_cutoff=self.no_cutoff,
                step_callback=self.on_step,
            )
            result = await runner.run_challenge(challenge, attempt=attempt)

            # Evaluate result

@@ -5,17 +5,20 @@ import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional

from forge.file_storage import FileStorageBackendName, get_storage
from forge.llm.providers import MultiProvider
from typing import Callable, Optional

from autogpt.agent_factory.configurators import create_agent
from autogpt.agents.agent import Agent
from autogpt.app.config import AppConfig, ConfigBuilder
from forge.file_storage import FileStorageBackendName, get_storage
from forge.llm.providers import MultiProvider

from .models import BenchmarkConfig, Challenge, ChallengeResult, StepResult

# Type for step logging callback
StepCallback = Callable[[str, str, int, str, str, bool], None]
# Args: config_name, challenge_name, step_num, tool_name, result_preview, is_error


class AgentRunner:
    """Runs a single agent instance for a challenge."""

@@ -25,10 +28,12 @@ class AgentRunner:
        config: BenchmarkConfig,
        workspace_root: Path,
        no_cutoff: bool = False,
        step_callback: Optional[StepCallback] = None,
    ):
        self.config = config
        self.workspace_root = workspace_root
        self.no_cutoff = no_cutoff
        self.step_callback = step_callback
        self._agent: Optional[Agent] = None
        self._workspace: Optional[Path] = None
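
Given the constructor above, any callable matching StepCallback can be injected. A hedged usage sketch follows; config and workspace_root stand in for objects the executor already holds, and echo_step is invented for the example:

def echo_step(config_name, challenge_name, step_num, tool_name, preview, is_error):
    suffix = " (error)" if is_error else ""
    print(f"[{config_name}] {challenge_name} #{step_num} {tool_name}: {preview}{suffix}")


runner = AgentRunner(
    config,          # an existing BenchmarkConfig
    workspace_root,  # an existing pathlib.Path
    no_cutoff=False,
    step_callback=echo_step,  # omit (None) to keep the previous, silent behaviour
)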
@@ -210,20 +215,42 @@ class AgentRunner:
            step_cost = 0.0  # TODO: Extract from LLM provider
            cumulative_cost += step_cost

            # Get result info
            result_str = str(
                result.outputs if hasattr(result, "outputs") else result
            )
            is_error = hasattr(result, "status") and result.status == "error"

            # Record step
            steps.append(
                StepResult(
                    step_num=step_num + 1,
                    tool_name=proposal.use_tool.name,
                    tool_args=proposal.use_tool.arguments,
                    result=str(
                        result.outputs if hasattr(result, "outputs") else result
                    ),
                    is_error=hasattr(result, "status") and result.status == "error",
                    result=result_str,
                    is_error=is_error,
                    cumulative_cost=cumulative_cost,
                )
            )

            # Call step callback if provided
            if self.step_callback:
                # Truncate result for display
                result_preview = (
                    result_str[:100] + "..."
                    if len(result_str) > 100
                    else result_str
                )
                result_preview = result_preview.replace("\n", " ")
                self.step_callback(
                    self.config.config_name,
                    challenge.name,
                    step_num + 1,
                    proposal.use_tool.name,
                    result_preview,
                    is_error,
                )

        return False  # Hit max steps

        # Run with or without timeout
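
The preview handed to the callback in the hunk above is just the stringified step result, truncated and flattened to one line. The same logic in isolation (the helper name is illustrative, not part of the commit):

def make_preview(result_str: str, limit: int = 100) -> str:
    preview = result_str[:limit] + "..." if len(result_str) > limit else result_str
    return preview.replace("\n", " ")  # keep the UI line single-line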
@@ -3,7 +3,7 @@
from datetime import datetime
from typing import Optional

from rich.console import Console, Group
from rich.console import Console, Group, RenderableType
from rich.live import Live
from rich.panel import Panel
from rich.progress import (

@@ -22,6 +22,18 @@ from .models import ChallengeResult, ExecutionProgress

console = Console()

# Colors for different configs (cycling through for parallel runs)
CONFIG_COLORS = [
    "cyan",
    "green",
    "yellow",
    "magenta",
    "blue",
    "red",
    "bright_cyan",
    "bright_green",
]


class BenchmarkUI:
    """Rich UI for benchmark progress and results."""

@@ -33,8 +45,11 @@ class BenchmarkUI:

        # Track state
        self.active_runs: dict[str, str] = {}  # config_name -> challenge_name
        self.active_steps: dict[str, str] = {}  # config_name -> current step info
        self.completed: list[ChallengeResult] = []
        self.results_by_config: dict[str, list[ChallengeResult]] = {}
        self.recent_steps: list[tuple[str, str, int, str, bool]] = []  # Last N steps
        self.config_colors: dict[str, str] = {}  # config_name -> color

        # Progress tracking
        self.progress = Progress(

@@ -53,17 +68,56 @@ class BenchmarkUI:
        self.start_time = datetime.now()
        self.total_challenges = total_challenges
        self.results_by_config = {config: [] for config in configs}
        # Assign colors to configs
        for i, config in enumerate(configs):
            self.config_colors[config] = CONFIG_COLORS[i % len(CONFIG_COLORS)]
        self.main_task = self.progress.add_task(
            "[cyan]Running benchmarks...", total=total_challenges
        )

    def get_config_color(self, config_name: str) -> str:
        """Get the assigned color for a config."""
        return self.config_colors.get(config_name, "white")

    def log_step(
        self,
        config_name: str,
        challenge_name: str,
        step_num: int,
        tool_name: str,
        result_preview: str,
        is_error: bool,
    ) -> None:
        """Log a step execution (called from AgentRunner)."""
        # Update active step info
        self.active_steps[config_name] = f"step {step_num}: {tool_name}"

        # Add to recent steps (keep last 10)
        self.recent_steps.append(
            (config_name, challenge_name, step_num, tool_name, is_error)
        )
        if len(self.recent_steps) > 10:
            self.recent_steps.pop(0)

        # In verbose mode, print immediately
        if self.verbose:
            color = self.get_config_color(config_name)
            status = "[red]ERR[/red]" if is_error else "[green]OK[/green]"
            console.print(
                f"[{color}][{config_name}][/{color}] {challenge_name} "
                f"step {step_num}: {tool_name} {status}"
            )

    def update(self, progress: ExecutionProgress) -> None:
        """Update UI with execution progress."""
        if progress.status == "starting":
            self.active_runs[progress.config_name] = progress.challenge_name
            self.active_steps[progress.config_name] = "starting..."
        elif progress.status in ("completed", "failed"):
            if progress.config_name in self.active_runs:
                del self.active_runs[progress.config_name]
            if progress.config_name in self.active_steps:
                del self.active_steps[progress.config_name]
            if progress.result:
                self.completed.append(progress.result)
                self.results_by_config[progress.config_name].append(progress.result)
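
log_step above keeps the last 10 steps with an append followed by pop(0). An equivalent bounded buffer, not what the diff uses but a common alternative, is collections.deque with maxlen; the sample tuple values here are illustrative only:

from collections import deque

# Equivalent to "append, then pop(0) when len > 10": old entries fall off automatically.
recent_steps: deque[tuple[str, str, int, str, bool]] = deque(maxlen=10)
recent_steps.append(("gpt-4o", "ReadFile", 1, "read_file", False))  # placeholder data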
@@ -84,17 +138,20 @@ class BenchmarkUI:
    def render_active_runs(self) -> Panel:
        """Render panel showing active runs."""
        if not self.active_runs:
            content = Text("No active runs", style="dim")
            content = Text("Waiting for runs to start...", style="dim")
        else:
            lines = []
            for config_name, challenge_name in self.active_runs.items():
                color = self.get_config_color(config_name)
                step_info = self.active_steps.get(config_name, "")
                lines.append(
                    Text.assemble(
                        (" ", ""),
                        ("\u25cf ", "yellow"),  # Bullet point
                        (f"{config_name}", "cyan"),
                        (f"{config_name}", color),
                        (" \u2192 ", "dim"),  # Arrow
                        (challenge_name, "white"),
                        (f" ({step_info})", "dim") if step_info else ("", ""),
                    )
                )
            content = Group(*lines)

@@ -164,6 +221,44 @@ class BenchmarkUI:
            border_style="green" if self.completed else "dim",
        )

    def render_recent_steps(self) -> Panel:
        """Render panel showing recent step executions."""
        if not self.recent_steps:
            content = Text("No steps yet", style="dim")
        else:
            lines = []
            for (
                config_name,
                challenge,
                step_num,
                tool_name,
                is_error,
            ) in self.recent_steps[-5:]:
                color = self.get_config_color(config_name)
                status = (
                    Text("\u2717", style="red")
                    if is_error
                    else Text("\u2713", style="green")
                )
                lines.append(
                    Text.assemble(
                        (" ", ""),
                        status,
                        (" ", ""),
                        (f"[{config_name}]", color),
                        (" ", ""),
                        (f"{challenge} #{step_num}: ", "dim"),
                        (tool_name, "white"),
                    )
                )
            content = Group(*lines)

        return Panel(
            content,
            title="[bold]Recent Steps[/bold]",
            border_style="dim",
        )

    def render_live_display(self) -> Group:
        """Render the full live display."""
        return Group(

@@ -171,9 +266,15 @@ class BenchmarkUI:
            "",
            self.render_active_runs(),
            "",
            self.render_recent_steps(),
            "",
            self.render_recent_completions(),
        )

    def __rich_console__(self, console: Console, options) -> RenderableType:
        """Support for Rich Live display - called on each refresh."""
        yield self.render_live_display()

    def print_final_summary(self) -> None:
        """Print final summary after all benchmarks complete."""
        elapsed = (