diff --git a/.github/workflows/classic-autogpt-ci.yml b/.github/workflows/classic-autogpt-ci.yml
index 41ea0a9dfd..3a1b5c42bf 100644
--- a/.github/workflows/classic-autogpt-ci.yml
+++ b/.github/workflows/classic-autogpt-ci.yml
@@ -30,40 +30,15 @@ jobs:
     permissions:
       contents: read
     timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.12", "3.13", "3.14"]
-        platform-os: [ubuntu, macos, macos-arm64, windows]
-    runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
+    runs-on: ubuntu-latest

     steps:
-      # Quite slow on macOS (2~4 minutes to set up Docker)
-      # - name: Set up Docker (macOS)
-      #   if: runner.os == 'macOS'
-      #   uses: crazy-max/ghaction-setup-docker@v3
-
-      - name: Start MinIO service (Linux)
-        if: runner.os == 'Linux'
+      - name: Start MinIO service
         working-directory: '.'
         run: |
           docker pull minio/minio:edge-cicd
           docker run -d -p 9000:9000 minio/minio:edge-cicd

-      - name: Start MinIO service (macOS)
-        if: runner.os == 'macOS'
-        working-directory: ${{ runner.temp }}
-        run: |
-          brew install minio/stable/minio
-          mkdir data
-          minio server ./data &
-
-      # No MinIO on Windows:
-      #   - Windows doesn't support running Linux Docker containers
-      #   - It doesn't seem possible to start background processes on Windows. They are
-      #     killed after the step returns.
-      #     See: https://github.com/actions/runner/issues/598#issuecomment-2011890429
-
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
@@ -75,41 +50,23 @@ jobs:
           git config --global user.name "Auto-GPT-Bot"
           git config --global user.email "github-bot@agpt.co"

-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.12"

       - id: get_date
         name: Get date
         run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT

       - name: Set up Python dependency cache
-        # On Windows, unpacking cached dependencies takes longer than just installing them
-        if: runner.os != 'Windows'
         uses: actions/cache@v4
         with:
-          path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
+          path: ~/.cache/pypoetry
           key: poetry-${{ runner.os }}-${{ hashFiles('classic/original_autogpt/poetry.lock') }}

-      - name: Install Poetry (Unix)
-        if: runner.os != 'Windows'
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-
-          if [ "${{ runner.os }}" = "macOS" ]; then
-            PATH="$HOME/.local/bin:$PATH"
-            echo "$HOME/.local/bin" >> $GITHUB_PATH
-          fi
-
-      - name: Install Poetry (Windows)
-        if: runner.os == 'Windows'
-        shell: pwsh
-        run: |
-          (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
-
-          $env:PATH += ";$env:APPDATA\Python\Scripts"
-          echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH
+      - name: Install Poetry
+        run: curl -sSL https://install.python-poetry.org | python3 -

       - name: Install Python dependencies
         run: poetry install
@@ -129,7 +86,7 @@ jobs:
           CI: true
           PLAIN_OUTPUT: True
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          S3_ENDPOINT_URL: ${{ runner.os != 'Windows' && 'http://127.0.0.1:9000' || '' }}
+          S3_ENDPOINT_URL: http://127.0.0.1:9000
           AWS_ACCESS_KEY_ID: minioadmin
           AWS_SECRET_ACCESS_KEY: minioadmin

@@ -143,7 +100,7 @@ jobs:
         uses: codecov/codecov-action@v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          flags: autogpt-agent,${{ runner.os }}
+          flags: autogpt-agent

       - name: Upload logs to artifact
         if: always()
diff --git a/.github/workflows/classic-forge-ci.yml b/.github/workflows/classic-forge-ci.yml
index df1431f844..6ee9ab81ed 100644
--- a/.github/workflows/classic-forge-ci.yml
+++ b/.github/workflows/classic-forge-ci.yml
@@ -26,74 +26,31 @@ jobs:
     permissions:
       contents: read
     timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.12", "3.13", "3.14"]
-        platform-os: [ubuntu, macos, macos-arm64, windows]
-    runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
+    runs-on: ubuntu-latest

     steps:
-      # Quite slow on macOS (2~4 minutes to set up Docker)
-      # - name: Set up Docker (macOS)
-      #   if: runner.os == 'macOS'
-      #   uses: crazy-max/ghaction-setup-docker@v3
-
-      - name: Start MinIO service (Linux)
-        if: runner.os == 'Linux'
+      - name: Start MinIO service
         working-directory: '.'
         run: |
           docker pull minio/minio:edge-cicd
           docker run -d -p 9000:9000 minio/minio:edge-cicd

-      - name: Start MinIO service (macOS)
-        if: runner.os == 'macOS'
-        working-directory: ${{ runner.temp }}
-        run: |
-          brew install minio/stable/minio
-          mkdir data
-          minio server ./data &
-
-      # No MinIO on Windows:
-      #   - Windows doesn't support running Linux Docker containers
-      #   - It doesn't seem possible to start background processes on Windows. They are
-      #     killed after the step returns.
-      #     See: https://github.com/actions/runner/issues/598#issuecomment-2011890429
-
       - name: Checkout repository
         uses: actions/checkout@v4

-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.12"

       - name: Set up Python dependency cache
-        # On Windows, unpacking cached dependencies takes longer than just installing them
-        if: runner.os != 'Windows'
         uses: actions/cache@v4
         with:
-          path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
+          path: ~/.cache/pypoetry
           key: poetry-${{ runner.os }}-${{ hashFiles('classic/forge/poetry.lock') }}

-      - name: Install Poetry (Unix)
-        if: runner.os != 'Windows'
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-
-          if [ "${{ runner.os }}" = "macOS" ]; then
-            PATH="$HOME/.local/bin:$PATH"
-            echo "$HOME/.local/bin" >> $GITHUB_PATH
-          fi
-
-      - name: Install Poetry (Windows)
-        if: runner.os == 'Windows'
-        shell: pwsh
-        run: |
-          (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
-
-          $env:PATH += ";$env:APPDATA\Python\Scripts"
-          echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH
+      - name: Install Poetry
+        run: curl -sSL https://install.python-poetry.org | python3 -

       - name: Install Python dependencies
         run: poetry install
@@ -112,7 +69,7 @@ jobs:
           # Secrets are not available to fork PRs (GitHub security feature)
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-          S3_ENDPOINT_URL: ${{ runner.os != 'Windows' && 'http://127.0.0.1:9000' || '' }}
+          S3_ENDPOINT_URL: http://127.0.0.1:9000
           AWS_ACCESS_KEY_ID: minioadmin
           AWS_SECRET_ACCESS_KEY: minioadmin

@@ -126,11 +83,11 @@ jobs:
         uses: codecov/codecov-action@v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          flags: forge,${{ runner.os }}
+          flags: forge

       - name: Upload logs to artifact
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: test-logs-${{ matrix.platform-os }}-${{ matrix.python-version }}
+          name: test-logs
           path: classic/forge/logs/
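Both workflows above now run only on ubuntu-latest, start MinIO in Docker unconditionally, and point the tests at it through S3_ENDPOINT_URL plus MinIO's default minioadmin credentials. As an illustration only (not code from this repository, and boto3 plus the bucket name are assumptions about the client in use), an S3 client configured from those same environment variables would look roughly like this:

import os

import boto3

# Reads the same variables the workflow steps export for the test run.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],          # http://127.0.0.1:9000
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],    # minioadmin
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)
s3.create_bucket(Bucket="ci-smoke-test")                  # hypothetical bucket name
s3.put_object(Bucket="ci-smoke-test", Key="hello.txt", Body=b"hello from CI")
print(s3.get_object(Bucket="ci-smoke-test", Key="hello.txt")["Body"].read())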
diff --git a/classic/direct_benchmark/challenges/base.py b/classic/direct_benchmark/challenges/base.py
index c3c5cdf7de..ff73706627 100644
--- a/classic/direct_benchmark/challenges/base.py
+++ b/classic/direct_benchmark/challenges/base.py
@@ -4,13 +4,12 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Awaitable, ClassVar, Optional

 import pytest
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
 from agent_protocol_client import AgentApi, Step
 from colorama import Fore, Style
 from pydantic import BaseModel, Field

-from agbenchmark.config import AgentBenchmarkConfig
-from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
-
 logger = logging.getLogger(__name__)


@@ -183,4 +182,5 @@ class BaseChallenge(ABC):
     @abstractmethod
     async def evaluate_task_state(
         cls, agent: AgentApi, task_id: str
-    ) -> list[EvalResult]: ...
+    ) -> list[EvalResult]:
+        ...
diff --git a/classic/direct_benchmark/challenges/builtin.py b/classic/direct_benchmark/challenges/builtin.py
index d1e0418818..feb2ed8aad 100644
--- a/classic/direct_benchmark/challenges/builtin.py
+++ b/classic/direct_benchmark/challenges/builtin.py
@@ -10,6 +10,16 @@ from pathlib import Path
 from typing import Annotated, Any, ClassVar, Iterator, Literal, Optional

 import pytest
+from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
+from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
+from agbenchmark.utils.prompts import (
+    END_PROMPT,
+    FEW_SHOT_EXAMPLES,
+    PROMPT_MAP,
+    SCORING_MAP,
+)
 from agent_protocol_client import AgentApi, ApiClient
 from agent_protocol_client import Configuration as ClientConfig
 from agent_protocol_client import Step
@@ -23,17 +33,6 @@ from pydantic import (
     field_validator,
 )

-from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
-from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
-from agbenchmark.config import AgentBenchmarkConfig
-from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
-from agbenchmark.utils.prompts import (
-    END_PROMPT,
-    FEW_SHOT_EXAMPLES,
-    PROMPT_MAP,
-    SCORING_MAP,
-)
-
 from .base import BaseChallenge, ChallengeInfo

 logger = logging.getLogger(__name__)
@@ -69,9 +68,9 @@ class BuiltinChallengeSpec(BaseModel):
     class Eval(BaseModel):
         type: str
         scoring: Optional[Literal["percentage", "scale", "binary"]] = None
-        template: Optional[Literal["rubric", "reference", "question", "custom"]] = (
-            None
-        )
+        template: Optional[
+            Literal["rubric", "reference", "question", "custom"]
+        ] = None
         examples: Optional[str] = None

         @field_validator("scoring", "template")
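The Eval sub-model shown above constrains scoring and template to fixed Literal sets and runs both through a shared field_validator. A minimal pydantic v2 sketch of that pattern (EvalSpec and check_eval_fields are hypothetical stand-ins, and the pass-through rule is a placeholder, not the repository's actual validation logic):

from typing import Literal, Optional

from pydantic import BaseModel, ValidationError, field_validator


class EvalSpec(BaseModel):
    # Hypothetical stand-in for BuiltinChallengeSpec.Eval
    type: str
    scoring: Optional[Literal["percentage", "scale", "binary"]] = None
    template: Optional[Literal["rubric", "reference", "question", "custom"]] = None

    @field_validator("scoring", "template")
    @classmethod
    def check_eval_fields(cls, value: Optional[str]) -> Optional[str]:
        # Placeholder: the repository's validator applies its own rules;
        # this stub just passes the already Literal-checked value through.
        return value


print(EvalSpec(type="llm", scoring="binary", template="rubric"))

try:
    EvalSpec(type="llm", scoring="exact")  # not one of the allowed Literal values
except ValidationError as exc:
    print(exc)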
diff --git a/classic/direct_benchmark/challenges/webarena.py b/classic/direct_benchmark/challenges/webarena.py
index be8a300be8..e1e8c049cb 100644
--- a/classic/direct_benchmark/challenges/webarena.py
+++ b/classic/direct_benchmark/challenges/webarena.py
@@ -5,11 +5,10 @@ from typing import ClassVar, Iterator, Literal

 import pytest
 import requests
-from agent_protocol_client import AgentApi, Step
-from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator
-
 from agbenchmark.config import AgentBenchmarkConfig
 from agbenchmark.utils.data_types import Category, EvalResult
+from agent_protocol_client import AgentApi, Step
+from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator

 from .base import BaseChallenge, ChallengeInfo

@@ -84,11 +83,13 @@ def resolve_uri(uri: str) -> str:

 class Eval(ABC):
     @abstractmethod
-    def evaluate(self, string: str) -> bool: ...
+    def evaluate(self, string: str) -> bool:
+        ...

     @property
     @abstractmethod
-    def description(self) -> str: ...
+    def description(self) -> str:
+        ...


 class BaseStringEval(BaseModel, Eval):
diff --git a/classic/direct_benchmark/direct_benchmark/harness.py b/classic/direct_benchmark/direct_benchmark/harness.py
index 3e265f4dcc..df94caf439 100644
--- a/classic/direct_benchmark/direct_benchmark/harness.py
+++ b/classic/direct_benchmark/direct_benchmark/harness.py
@@ -88,10 +88,16 @@ class BenchmarkHarness:
             if progress.result:
                 all_results[progress.config_name].append(progress.result)

+        # Create step callback if UI supports it
+        step_callback = None
+        if hasattr(ui, "log_step"):
+            step_callback = ui.log_step
+
         # Create executor
         executor = ParallelExecutor(
             max_parallel=self.config.max_parallel,
             on_progress=on_progress,
+            on_step=step_callback,
             attempts=self.config.attempts,
             no_cutoff=self.config.no_cutoff,
         )
@@ -101,7 +107,8 @@ class BenchmarkHarness:

         # Run with or without live display
         if isinstance(ui, BenchmarkUI) and ui_mode == "default":
-            with Live(ui.render_live_display(), console=console, refresh_per_second=4):
+            # Pass the UI object itself so Live can refresh it
+            with Live(ui, console=console, refresh_per_second=4):
                 async for _ in executor.execute_matrix(
                     self.config.configs,
                     challenges,
diff --git a/classic/direct_benchmark/direct_benchmark/parallel.py b/classic/direct_benchmark/direct_benchmark/parallel.py
index 954508a6e8..9f2dcc0ed9 100644
--- a/classic/direct_benchmark/direct_benchmark/parallel.py
+++ b/classic/direct_benchmark/direct_benchmark/parallel.py
@@ -6,7 +6,7 @@ from typing import AsyncIterator, Callable, Optional

 from .evaluator import Evaluator
 from .models import BenchmarkConfig, Challenge, ChallengeResult, ExecutionProgress
-from .runner import AgentRunner
+from .runner import AgentRunner, StepCallback


 class ParallelExecutor:
@@ -16,11 +16,13 @@ class ParallelExecutor:
         self,
         max_parallel: int = 4,
         on_progress: Optional[Callable[[ExecutionProgress], None]] = None,
+        on_step: Optional[StepCallback] = None,
         attempts: int = 1,
         no_cutoff: bool = False,
     ):
         self.max_parallel = max_parallel
         self.on_progress = on_progress
+        self.on_step = on_step
         self.attempts = attempts
         self.no_cutoff = no_cutoff
         self._semaphore = asyncio.Semaphore(max_parallel)
@@ -86,7 +88,12 @@ class ParallelExecutor:
             )

             # Run the challenge (with modified timeout if no_cutoff is set)
-            runner = AgentRunner(config, workspace_root, no_cutoff=self.no_cutoff)
+            runner = AgentRunner(
+                config,
+                workspace_root,
+                no_cutoff=self.no_cutoff,
+                step_callback=self.on_step,
+            )
             result = await runner.run_challenge(challenge, attempt=attempt)

             # Evaluate result
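ParallelExecutor now simply forwards an optional on_step callback into each AgentRunner, and the harness only wires it up when the UI object actually exposes log_step. A self-contained sketch of that plumbing pattern (ToyRunner, ToyExecutor, and ToyUI are invented names that only mirror the StepCallback signature introduced in the runner.py diff below, not the repository's real classes):

from typing import Callable, Optional

# Same shape as runner.py's StepCallback:
# (config_name, challenge_name, step_num, tool_name, result_preview, is_error)
StepCallback = Callable[[str, str, int, str, str, bool], None]


class ToyRunner:
    """Stand-in for AgentRunner: fires the callback after every step, if one was given."""

    def __init__(self, config_name: str, step_callback: Optional[StepCallback] = None):
        self.config_name = config_name
        self.step_callback = step_callback

    def run(self, challenge_name: str) -> None:
        for step_num, tool_name in enumerate(["read_file", "write_file"], start=1):
            if self.step_callback:
                self.step_callback(
                    self.config_name, challenge_name, step_num, tool_name, "ok", False
                )


class ToyExecutor:
    """Stand-in for ParallelExecutor: holds the callback and passes it through untouched."""

    def __init__(self, on_step: Optional[StepCallback] = None):
        self.on_step = on_step

    def execute(self, config_name: str, challenge_name: str) -> None:
        ToyRunner(config_name, step_callback=self.on_step).run(challenge_name)


class ToyUI:
    def log_step(self, config, challenge, step, tool, preview, is_error) -> None:
        print(f"[{config}] {challenge} step {step}: {tool} -> {preview} (error={is_error})")


ui = ToyUI()
# Duck-typed wiring, as in harness.py: only hook up the callback if the UI supports it.
executor = ToyExecutor(on_step=ui.log_step if hasattr(ui, "log_step") else None)
executor.execute("config-a", "toy-challenge")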
diff --git a/classic/direct_benchmark/direct_benchmark/runner.py b/classic/direct_benchmark/direct_benchmark/runner.py
index 6da689c886..871b5e223d 100644
--- a/classic/direct_benchmark/direct_benchmark/runner.py
+++ b/classic/direct_benchmark/direct_benchmark/runner.py
@@ -5,17 +5,20 @@ import shutil
 import tempfile
 from datetime import datetime
 from pathlib import Path
-from typing import Optional
-
-from forge.file_storage import FileStorageBackendName, get_storage
-from forge.llm.providers import MultiProvider
+from typing import Callable, Optional

 from autogpt.agent_factory.configurators import create_agent
 from autogpt.agents.agent import Agent
 from autogpt.app.config import AppConfig, ConfigBuilder
+from forge.file_storage import FileStorageBackendName, get_storage
+from forge.llm.providers import MultiProvider

 from .models import BenchmarkConfig, Challenge, ChallengeResult, StepResult

+# Type for step logging callback
+StepCallback = Callable[[str, str, int, str, str, bool], None]
+# Args: config_name, challenge_name, step_num, tool_name, result_preview, is_error
+

 class AgentRunner:
     """Runs a single agent instance for a challenge."""
@@ -25,10 +28,12 @@
         config: BenchmarkConfig,
         workspace_root: Path,
         no_cutoff: bool = False,
+        step_callback: Optional[StepCallback] = None,
     ):
         self.config = config
         self.workspace_root = workspace_root
         self.no_cutoff = no_cutoff
+        self.step_callback = step_callback
         self._agent: Optional[Agent] = None
         self._workspace: Optional[Path] = None

@@ -210,20 +215,42 @@
             step_cost = 0.0  # TODO: Extract from LLM provider
             cumulative_cost += step_cost

+            # Get result info
+            result_str = str(
+                result.outputs if hasattr(result, "outputs") else result
+            )
+            is_error = hasattr(result, "status") and result.status == "error"
+
             # Record step
             steps.append(
                 StepResult(
                     step_num=step_num + 1,
                     tool_name=proposal.use_tool.name,
                     tool_args=proposal.use_tool.arguments,
-                    result=str(
-                        result.outputs if hasattr(result, "outputs") else result
-                    ),
-                    is_error=hasattr(result, "status") and result.status == "error",
+                    result=result_str,
+                    is_error=is_error,
                     cumulative_cost=cumulative_cost,
                 )
             )

+            # Call step callback if provided
+            if self.step_callback:
+                # Truncate result for display
+                result_preview = (
+                    result_str[:100] + "..."
+                    if len(result_str) > 100
+                    else result_str
+                )
+                result_preview = result_preview.replace("\n", " ")
+                self.step_callback(
+                    self.config.config_name,
+                    challenge.name,
+                    step_num + 1,
+                    proposal.use_tool.name,
+                    result_preview,
+                    is_error,
+                )
+
         return False  # Hit max steps

     # Run with or without timeout
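harness.py above now hands the BenchmarkUI object itself to rich.live.Live, and the ui.py diff below gives that object a __rich_console__ method, so the live display re-renders from the UI's current state on every refresh instead of showing a single stale render. A minimal standalone sketch of that Rich protocol (the Dashboard class and its fields are invented for illustration, not this repository's code; only the rich library is assumed):

import time

from rich.console import Console, ConsoleOptions, RenderResult
from rich.live import Live
from rich.panel import Panel


class Dashboard:
    """Any object with __rich_console__ is renderable; Live re-renders it on each refresh."""

    def __init__(self) -> None:
        self.ticks = 0

    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
        yield Panel(f"ticks so far: {self.ticks}", title="demo")


dashboard = Dashboard()
with Live(dashboard, refresh_per_second=4):
    for _ in range(8):
        dashboard.ticks += 1  # mutate state; the next auto-refresh picks it up
        time.sleep(0.25)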
diff --git a/classic/direct_benchmark/direct_benchmark/ui.py b/classic/direct_benchmark/direct_benchmark/ui.py
index 2cc09976d2..df4cc97fa5 100644
--- a/classic/direct_benchmark/direct_benchmark/ui.py
+++ b/classic/direct_benchmark/direct_benchmark/ui.py
@@ -3,7 +3,7 @@
 from datetime import datetime
 from typing import Optional

-from rich.console import Console, Group
+from rich.console import Console, Group, RenderableType
 from rich.live import Live
 from rich.panel import Panel
 from rich.progress import (
@@ -22,6 +22,18 @@ from .models import ChallengeResult, ExecutionProgress

 console = Console()

+# Colors for different configs (cycling through for parallel runs)
+CONFIG_COLORS = [
+    "cyan",
+    "green",
+    "yellow",
+    "magenta",
+    "blue",
+    "red",
+    "bright_cyan",
+    "bright_green",
+]
+

 class BenchmarkUI:
     """Rich UI for benchmark progress and results."""
@@ -33,8 +45,11 @@ class BenchmarkUI:

         # Track state
         self.active_runs: dict[str, str] = {}  # config_name -> challenge_name
+        self.active_steps: dict[str, str] = {}  # config_name -> current step info
         self.completed: list[ChallengeResult] = []
         self.results_by_config: dict[str, list[ChallengeResult]] = {}
+        self.recent_steps: list[tuple[str, str, int, str, bool]] = []  # Last N steps
+        self.config_colors: dict[str, str] = {}  # config_name -> color

         # Progress tracking
         self.progress = Progress(
@@ -53,17 +68,56 @@ class BenchmarkUI:
         self.start_time = datetime.now()
         self.total_challenges = total_challenges
         self.results_by_config = {config: [] for config in configs}
+        # Assign colors to configs
+        for i, config in enumerate(configs):
+            self.config_colors[config] = CONFIG_COLORS[i % len(CONFIG_COLORS)]
         self.main_task = self.progress.add_task(
             "[cyan]Running benchmarks...", total=total_challenges
         )

+    def get_config_color(self, config_name: str) -> str:
+        """Get the assigned color for a config."""
+        return self.config_colors.get(config_name, "white")
+
+    def log_step(
+        self,
+        config_name: str,
+        challenge_name: str,
+        step_num: int,
+        tool_name: str,
+        result_preview: str,
+        is_error: bool,
+    ) -> None:
+        """Log a step execution (called from AgentRunner)."""
+        # Update active step info
+        self.active_steps[config_name] = f"step {step_num}: {tool_name}"
+
+        # Add to recent steps (keep last 10)
+        self.recent_steps.append(
+            (config_name, challenge_name, step_num, tool_name, is_error)
+        )
+        if len(self.recent_steps) > 10:
+            self.recent_steps.pop(0)
+
+        # In verbose mode, print immediately
+        if self.verbose:
+            color = self.get_config_color(config_name)
+            status = "[red]ERR[/red]" if is_error else "[green]OK[/green]"
+            console.print(
+                f"[{color}][{config_name}][/{color}] {challenge_name} "
+                f"step {step_num}: {tool_name} {status}"
+            )
+
     def update(self, progress: ExecutionProgress) -> None:
         """Update UI with execution progress."""
         if progress.status == "starting":
             self.active_runs[progress.config_name] = progress.challenge_name
+            self.active_steps[progress.config_name] = "starting..."
         elif progress.status in ("completed", "failed"):
             if progress.config_name in self.active_runs:
                 del self.active_runs[progress.config_name]
+            if progress.config_name in self.active_steps:
+                del self.active_steps[progress.config_name]
             if progress.result:
                 self.completed.append(progress.result)
                 self.results_by_config[progress.config_name].append(progress.result)
@@ -84,17 +138,20 @@ class BenchmarkUI:
     def render_active_runs(self) -> Panel:
         """Render panel showing active runs."""
         if not self.active_runs:
-            content = Text("No active runs", style="dim")
+            content = Text("Waiting for runs to start...", style="dim")
         else:
             lines = []
             for config_name, challenge_name in self.active_runs.items():
+                color = self.get_config_color(config_name)
+                step_info = self.active_steps.get(config_name, "")
                 lines.append(
                     Text.assemble(
                         ("  ", ""),
                         ("\u25cf ", "yellow"),  # Bullet point
-                        (f"{config_name}", "cyan"),
+                        (f"{config_name}", color),
                         (" \u2192 ", "dim"),  # Arrow
                         (challenge_name, "white"),
+                        (f" ({step_info})", "dim") if step_info else ("", ""),
                     )
                 )
             content = Group(*lines)
@@ -164,6 +221,44 @@ class BenchmarkUI:
             border_style="green" if self.completed else "dim",
         )

+    def render_recent_steps(self) -> Panel:
+        """Render panel showing recent step executions."""
+        if not self.recent_steps:
+            content = Text("No steps yet", style="dim")
+        else:
+            lines = []
+            for (
+                config_name,
+                challenge,
+                step_num,
+                tool_name,
+                is_error,
+            ) in self.recent_steps[-5:]:
+                color = self.get_config_color(config_name)
+                status = (
+                    Text("\u2717", style="red")
+                    if is_error
+                    else Text("\u2713", style="green")
+                )
+                lines.append(
+                    Text.assemble(
+                        ("  ", ""),
+                        status,
+                        (" ", ""),
+                        (f"[{config_name}]", color),
+                        (" ", ""),
+                        (f"{challenge} #{step_num}: ", "dim"),
+                        (tool_name, "white"),
+                    )
+                )
+            content = Group(*lines)
+
+        return Panel(
+            content,
+            title="[bold]Recent Steps[/bold]",
+            border_style="dim",
+        )
+
     def render_live_display(self) -> Group:
         """Render the full live display."""
         return Group(
@@ -171,9 +266,15 @@ class BenchmarkUI:
             "",
             self.render_active_runs(),
             "",
+            self.render_recent_steps(),
+            "",
             self.render_recent_completions(),
         )

+    def __rich_console__(self, console: Console, options) -> RenderableType:
+        """Support for Rich Live display - called on each refresh."""
+        yield self.render_live_display()
+
     def print_final_summary(self) -> None:
         """Print final summary after all benchmarks complete."""
         elapsed = (