Mirror of https://github.com/Significant-Gravitas/AutoGPT.git (synced 2026-01-21 04:57:58 -05:00)
feat(direct_benchmark): add step-level logging with colored prefixes
- Add step callback to AgentRunner for real-time step logging
- BenchmarkUI now shows:
  - Active runs with current step info
  - Recent steps panel with colored config prefixes
  - Proper Live display refresh (implements __rich_console__)
- Each config gets a distinct color for easy identification
- Verbose mode prints step logs immediately with config prefix
- Fix Live display not updating (pass UI object, not rendered content)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
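For context, here is a minimal, self-contained sketch of the pattern this commit describes: a step callback feeding a UI object that Rich's Live re-renders through __rich_console__ on each refresh. The MiniBenchmarkUI class, the config name "gpt-4o", and the demo loop are illustrative only and are not AutoGPT code; only the callback argument order (config_name, challenge_name, step_num, tool_name, result_preview, is_error) and the idea of passing the UI object itself to Live come from the diff below.

# Illustrative sketch, not part of the AutoGPT codebase.
import time
from typing import Callable

from rich.console import Console, Group
from rich.live import Live
from rich.panel import Panel
from rich.text import Text

# Same argument order as the StepCallback alias added in this commit:
# config_name, challenge_name, step_num, tool_name, result_preview, is_error
StepCallback = Callable[[str, str, int, str, str, bool], None]

CONFIG_COLORS = ["cyan", "green", "yellow", "magenta"]


class MiniBenchmarkUI:
    """Hypothetical stand-in for BenchmarkUI, reduced to the refresh mechanism."""

    def __init__(self) -> None:
        self.recent_steps: list[tuple[str, str, int, str, bool]] = []
        self.config_colors: dict[str, str] = {}

    def log_step(
        self,
        config_name: str,
        challenge_name: str,
        step_num: int,
        tool_name: str,
        result_preview: str,
        is_error: bool,
    ) -> None:
        # Assign a distinct color the first time a config is seen, then record the step.
        self.config_colors.setdefault(
            config_name, CONFIG_COLORS[len(self.config_colors) % len(CONFIG_COLORS)]
        )
        self.recent_steps.append(
            (config_name, challenge_name, step_num, tool_name, is_error)
        )
        self.recent_steps = self.recent_steps[-10:]  # keep only the last 10 steps

    def __rich_console__(self, console: Console, options):
        # Live calls this on every refresh, so the panel always reflects current state.
        if not self.recent_steps:
            content = Text("No steps yet", style="dim")
        else:
            lines = [
                Text.assemble(
                    ("ERR " if err else "OK  ", "red" if err else "green"),
                    (f"[{cfg}] ", self.config_colors.get(cfg, "white")),
                    (f"{challenge} #{num}: ", "dim"),
                    (tool, "white"),
                )
                for cfg, challenge, num, tool, err in self.recent_steps[-5:]
            ]
            content = Group(*lines)
        yield Panel(content, title="Recent Steps", border_style="dim")


if __name__ == "__main__":
    console = Console()
    ui = MiniBenchmarkUI()
    step_callback: StepCallback = ui.log_step
    # Pass the UI object itself (not a pre-rendered snapshot) so Live re-renders it.
    with Live(ui, console=console, refresh_per_second=4):
        for i in range(1, 6):
            step_callback("gpt-4o", "ReadFile", i, "read_file", "ok", i == 3)
            time.sleep(0.5)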
.github/workflows/classic-autogpt-ci.yml (vendored): 61 lines changed
@@ -30,40 +30,15 @@ jobs:
     permissions:
       contents: read
     timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.12", "3.13", "3.14"]
-        platform-os: [ubuntu, macos, macos-arm64, windows]
-    runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
+    runs-on: ubuntu-latest

     steps:
-      # Quite slow on macOS (2~4 minutes to set up Docker)
-      # - name: Set up Docker (macOS)
-      #   if: runner.os == 'macOS'
-      #   uses: crazy-max/ghaction-setup-docker@v3
-
-      - name: Start MinIO service (Linux)
-        if: runner.os == 'Linux'
+      - name: Start MinIO service
         working-directory: '.'
         run: |
           docker pull minio/minio:edge-cicd
           docker run -d -p 9000:9000 minio/minio:edge-cicd

-      - name: Start MinIO service (macOS)
-        if: runner.os == 'macOS'
-        working-directory: ${{ runner.temp }}
-        run: |
-          brew install minio/stable/minio
-          mkdir data
-          minio server ./data &
-
-      # No MinIO on Windows:
-      #  - Windows doesn't support running Linux Docker containers
-      #  - It doesn't seem possible to start background processes on Windows. They are
-      #    killed after the step returns.
-      #    See: https://github.com/actions/runner/issues/598#issuecomment-2011890429
-
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
@@ -75,41 +50,23 @@ jobs:
           git config --global user.name "Auto-GPT-Bot"
           git config --global user.email "github-bot@agpt.co"

-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.12"

       - id: get_date
         name: Get date
         run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT

       - name: Set up Python dependency cache
-        # On Windows, unpacking cached dependencies takes longer than just installing them
-        if: runner.os != 'Windows'
         uses: actions/cache@v4
         with:
-          path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
+          path: ~/.cache/pypoetry
           key: poetry-${{ runner.os }}-${{ hashFiles('classic/original_autogpt/poetry.lock') }}

-      - name: Install Poetry (Unix)
-        if: runner.os != 'Windows'
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-
-          if [ "${{ runner.os }}" = "macOS" ]; then
-            PATH="$HOME/.local/bin:$PATH"
-            echo "$HOME/.local/bin" >> $GITHUB_PATH
-          fi
-
-      - name: Install Poetry (Windows)
-        if: runner.os == 'Windows'
-        shell: pwsh
-        run: |
-          (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
-
-          $env:PATH += ";$env:APPDATA\Python\Scripts"
-          echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH
+      - name: Install Poetry
+        run: curl -sSL https://install.python-poetry.org | python3 -

       - name: Install Python dependencies
         run: poetry install
@@ -129,7 +86,7 @@ jobs:
           CI: true
           PLAIN_OUTPUT: True
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          S3_ENDPOINT_URL: ${{ runner.os != 'Windows' && 'http://127.0.0.1:9000' || '' }}
+          S3_ENDPOINT_URL: http://127.0.0.1:9000
           AWS_ACCESS_KEY_ID: minioadmin
           AWS_SECRET_ACCESS_KEY: minioadmin

@@ -143,7 +100,7 @@ jobs:
         uses: codecov/codecov-action@v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          flags: autogpt-agent,${{ runner.os }}
+          flags: autogpt-agent

       - name: Upload logs to artifact
         if: always()
.github/workflows/classic-forge-ci.yml (vendored): 63 lines changed
@@ -26,74 +26,31 @@ jobs:
     permissions:
       contents: read
     timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.12", "3.13", "3.14"]
-        platform-os: [ubuntu, macos, macos-arm64, windows]
-    runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
+    runs-on: ubuntu-latest

     steps:
-      # Quite slow on macOS (2~4 minutes to set up Docker)
-      # - name: Set up Docker (macOS)
-      #   if: runner.os == 'macOS'
-      #   uses: crazy-max/ghaction-setup-docker@v3
-
-      - name: Start MinIO service (Linux)
-        if: runner.os == 'Linux'
+      - name: Start MinIO service
         working-directory: '.'
         run: |
           docker pull minio/minio:edge-cicd
           docker run -d -p 9000:9000 minio/minio:edge-cicd

-      - name: Start MinIO service (macOS)
-        if: runner.os == 'macOS'
-        working-directory: ${{ runner.temp }}
-        run: |
-          brew install minio/stable/minio
-          mkdir data
-          minio server ./data &
-
-      # No MinIO on Windows:
-      #  - Windows doesn't support running Linux Docker containers
-      #  - It doesn't seem possible to start background processes on Windows. They are
-      #    killed after the step returns.
-      #    See: https://github.com/actions/runner/issues/598#issuecomment-2011890429
-
       - name: Checkout repository
         uses: actions/checkout@v4

-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.12"

       - name: Set up Python dependency cache
-        # On Windows, unpacking cached dependencies takes longer than just installing them
-        if: runner.os != 'Windows'
         uses: actions/cache@v4
         with:
-          path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
+          path: ~/.cache/pypoetry
           key: poetry-${{ runner.os }}-${{ hashFiles('classic/forge/poetry.lock') }}

-      - name: Install Poetry (Unix)
-        if: runner.os != 'Windows'
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-
-          if [ "${{ runner.os }}" = "macOS" ]; then
-            PATH="$HOME/.local/bin:$PATH"
-            echo "$HOME/.local/bin" >> $GITHUB_PATH
-          fi
-
-      - name: Install Poetry (Windows)
-        if: runner.os == 'Windows'
-        shell: pwsh
-        run: |
-          (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
-
-          $env:PATH += ";$env:APPDATA\Python\Scripts"
-          echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH
+      - name: Install Poetry
+        run: curl -sSL https://install.python-poetry.org | python3 -

       - name: Install Python dependencies
         run: poetry install
@@ -112,7 +69,7 @@ jobs:
           # Secrets are not available to fork PRs (GitHub security feature)
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-          S3_ENDPOINT_URL: ${{ runner.os != 'Windows' && 'http://127.0.0.1:9000' || '' }}
+          S3_ENDPOINT_URL: http://127.0.0.1:9000
           AWS_ACCESS_KEY_ID: minioadmin
           AWS_SECRET_ACCESS_KEY: minioadmin

@@ -126,11 +83,11 @@ jobs:
         uses: codecov/codecov-action@v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          flags: forge,${{ runner.os }}
+          flags: forge

       - name: Upload logs to artifact
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: test-logs-${{ matrix.platform-os }}-${{ matrix.python-version }}
+          name: test-logs
           path: classic/forge/logs/
@@ -4,13 +4,12 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Awaitable, ClassVar, Optional

 import pytest
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
 from agent_protocol_client import AgentApi, Step
 from colorama import Fore, Style
 from pydantic import BaseModel, Field

-from agbenchmark.config import AgentBenchmarkConfig
-from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
-
 logger = logging.getLogger(__name__)

@@ -183,4 +182,5 @@ class BaseChallenge(ABC):
     @abstractmethod
     async def evaluate_task_state(
         cls, agent: AgentApi, task_id: str
-    ) -> list[EvalResult]: ...
+    ) -> list[EvalResult]:
+        ...
@@ -10,6 +10,16 @@ from pathlib import Path
 from typing import Annotated, Any, ClassVar, Iterator, Literal, Optional

 import pytest
+from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
+from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
+from agbenchmark.utils.prompts import (
+    END_PROMPT,
+    FEW_SHOT_EXAMPLES,
+    PROMPT_MAP,
+    SCORING_MAP,
+)
 from agent_protocol_client import AgentApi, ApiClient
 from agent_protocol_client import Configuration as ClientConfig
 from agent_protocol_client import Step
@@ -23,17 +33,6 @@ from pydantic import (
     field_validator,
 )

-from agbenchmark.agent_api_interface import download_agent_artifacts_into_folder
-from agbenchmark.agent_interface import copy_challenge_artifacts_into_workspace
-from agbenchmark.config import AgentBenchmarkConfig
-from agbenchmark.utils.data_types import Category, DifficultyLevel, EvalResult
-from agbenchmark.utils.prompts import (
-    END_PROMPT,
-    FEW_SHOT_EXAMPLES,
-    PROMPT_MAP,
-    SCORING_MAP,
-)
-
 from .base import BaseChallenge, ChallengeInfo

 logger = logging.getLogger(__name__)
@@ -69,9 +68,9 @@ class BuiltinChallengeSpec(BaseModel):
     class Eval(BaseModel):
         type: str
         scoring: Optional[Literal["percentage", "scale", "binary"]] = None
-        template: Optional[Literal["rubric", "reference", "question", "custom"]] = (
-            None
-        )
+        template: Optional[
+            Literal["rubric", "reference", "question", "custom"]
+        ] = None
         examples: Optional[str] = None

         @field_validator("scoring", "template")
@@ -5,11 +5,10 @@ from typing import ClassVar, Iterator, Literal

 import pytest
 import requests
-from agent_protocol_client import AgentApi, Step
-from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator
-
 from agbenchmark.config import AgentBenchmarkConfig
 from agbenchmark.utils.data_types import Category, EvalResult
+from agent_protocol_client import AgentApi, Step
+from pydantic import BaseModel, ValidationError, ValidationInfo, field_validator

 from .base import BaseChallenge, ChallengeInfo

@@ -84,11 +83,13 @@ def resolve_uri(uri: str) -> str:

 class Eval(ABC):
     @abstractmethod
-    def evaluate(self, string: str) -> bool: ...
+    def evaluate(self, string: str) -> bool:
+        ...

     @property
     @abstractmethod
-    def description(self) -> str: ...
+    def description(self) -> str:
+        ...


 class BaseStringEval(BaseModel, Eval):
@@ -88,10 +88,16 @@ class BenchmarkHarness:
             if progress.result:
                 all_results[progress.config_name].append(progress.result)

+        # Create step callback if UI supports it
+        step_callback = None
+        if hasattr(ui, "log_step"):
+            step_callback = ui.log_step
+
         # Create executor
         executor = ParallelExecutor(
             max_parallel=self.config.max_parallel,
             on_progress=on_progress,
+            on_step=step_callback,
             attempts=self.config.attempts,
             no_cutoff=self.config.no_cutoff,
         )
@@ -101,7 +107,8 @@ class BenchmarkHarness:

         # Run with or without live display
         if isinstance(ui, BenchmarkUI) and ui_mode == "default":
-            with Live(ui.render_live_display(), console=console, refresh_per_second=4):
+            # Pass the UI object itself so Live can refresh it
+            with Live(ui, console=console, refresh_per_second=4):
                 async for _ in executor.execute_matrix(
                     self.config.configs,
                     challenges,
@@ -6,7 +6,7 @@ from typing import AsyncIterator, Callable, Optional

 from .evaluator import Evaluator
 from .models import BenchmarkConfig, Challenge, ChallengeResult, ExecutionProgress
-from .runner import AgentRunner
+from .runner import AgentRunner, StepCallback


 class ParallelExecutor:
@@ -16,11 +16,13 @@ class ParallelExecutor:
         self,
         max_parallel: int = 4,
         on_progress: Optional[Callable[[ExecutionProgress], None]] = None,
+        on_step: Optional[StepCallback] = None,
         attempts: int = 1,
         no_cutoff: bool = False,
     ):
         self.max_parallel = max_parallel
         self.on_progress = on_progress
+        self.on_step = on_step
         self.attempts = attempts
         self.no_cutoff = no_cutoff
         self._semaphore = asyncio.Semaphore(max_parallel)
@@ -86,7 +88,12 @@ class ParallelExecutor:
                 )

                 # Run the challenge (with modified timeout if no_cutoff is set)
-                runner = AgentRunner(config, workspace_root, no_cutoff=self.no_cutoff)
+                runner = AgentRunner(
+                    config,
+                    workspace_root,
+                    no_cutoff=self.no_cutoff,
+                    step_callback=self.on_step,
+                )
                 result = await runner.run_challenge(challenge, attempt=attempt)

                 # Evaluate result
@@ -5,17 +5,20 @@ import shutil
 import tempfile
 from datetime import datetime
 from pathlib import Path
-from typing import Optional
-
-from forge.file_storage import FileStorageBackendName, get_storage
-from forge.llm.providers import MultiProvider
+from typing import Callable, Optional

 from autogpt.agent_factory.configurators import create_agent
 from autogpt.agents.agent import Agent
 from autogpt.app.config import AppConfig, ConfigBuilder
+from forge.file_storage import FileStorageBackendName, get_storage
+from forge.llm.providers import MultiProvider

 from .models import BenchmarkConfig, Challenge, ChallengeResult, StepResult

+# Type for step logging callback
+StepCallback = Callable[[str, str, int, str, str, bool], None]
+# Args: config_name, challenge_name, step_num, tool_name, result_preview, is_error
+

 class AgentRunner:
     """Runs a single agent instance for a challenge."""
@@ -25,10 +28,12 @@ class AgentRunner:
         config: BenchmarkConfig,
         workspace_root: Path,
         no_cutoff: bool = False,
+        step_callback: Optional[StepCallback] = None,
     ):
         self.config = config
         self.workspace_root = workspace_root
         self.no_cutoff = no_cutoff
+        self.step_callback = step_callback
         self._agent: Optional[Agent] = None
         self._workspace: Optional[Path] = None

@@ -210,20 +215,42 @@ class AgentRunner:
                 step_cost = 0.0  # TODO: Extract from LLM provider
                 cumulative_cost += step_cost

+                # Get result info
+                result_str = str(
+                    result.outputs if hasattr(result, "outputs") else result
+                )
+                is_error = hasattr(result, "status") and result.status == "error"
+
                 # Record step
                 steps.append(
                     StepResult(
                         step_num=step_num + 1,
                         tool_name=proposal.use_tool.name,
                         tool_args=proposal.use_tool.arguments,
-                        result=str(
-                            result.outputs if hasattr(result, "outputs") else result
-                        ),
-                        is_error=hasattr(result, "status") and result.status == "error",
+                        result=result_str,
+                        is_error=is_error,
                         cumulative_cost=cumulative_cost,
                     )
                 )

+                # Call step callback if provided
+                if self.step_callback:
+                    # Truncate result for display
+                    result_preview = (
+                        result_str[:100] + "..."
+                        if len(result_str) > 100
+                        else result_str
+                    )
+                    result_preview = result_preview.replace("\n", " ")
+                    self.step_callback(
+                        self.config.config_name,
+                        challenge.name,
+                        step_num + 1,
+                        proposal.use_tool.name,
+                        result_preview,
+                        is_error,
+                    )
+
             return False  # Hit max steps

         # Run with or without timeout
@@ -3,7 +3,7 @@
 from datetime import datetime
 from typing import Optional

-from rich.console import Console, Group
+from rich.console import Console, Group, RenderableType
 from rich.live import Live
 from rich.panel import Panel
 from rich.progress import (
@@ -22,6 +22,18 @@ from .models import ChallengeResult, ExecutionProgress

 console = Console()

+# Colors for different configs (cycling through for parallel runs)
+CONFIG_COLORS = [
+    "cyan",
+    "green",
+    "yellow",
+    "magenta",
+    "blue",
+    "red",
+    "bright_cyan",
+    "bright_green",
+]
+

 class BenchmarkUI:
     """Rich UI for benchmark progress and results."""
@@ -33,8 +45,11 @@ class BenchmarkUI:

         # Track state
         self.active_runs: dict[str, str] = {}  # config_name -> challenge_name
+        self.active_steps: dict[str, str] = {}  # config_name -> current step info
         self.completed: list[ChallengeResult] = []
         self.results_by_config: dict[str, list[ChallengeResult]] = {}
+        self.recent_steps: list[tuple[str, str, int, str, bool]] = []  # Last N steps
+        self.config_colors: dict[str, str] = {}  # config_name -> color

         # Progress tracking
         self.progress = Progress(
@@ -53,17 +68,56 @@
         self.start_time = datetime.now()
         self.total_challenges = total_challenges
         self.results_by_config = {config: [] for config in configs}
+        # Assign colors to configs
+        for i, config in enumerate(configs):
+            self.config_colors[config] = CONFIG_COLORS[i % len(CONFIG_COLORS)]
         self.main_task = self.progress.add_task(
             "[cyan]Running benchmarks...", total=total_challenges
         )

+    def get_config_color(self, config_name: str) -> str:
+        """Get the assigned color for a config."""
+        return self.config_colors.get(config_name, "white")
+
+    def log_step(
+        self,
+        config_name: str,
+        challenge_name: str,
+        step_num: int,
+        tool_name: str,
+        result_preview: str,
+        is_error: bool,
+    ) -> None:
+        """Log a step execution (called from AgentRunner)."""
+        # Update active step info
+        self.active_steps[config_name] = f"step {step_num}: {tool_name}"
+
+        # Add to recent steps (keep last 10)
+        self.recent_steps.append(
+            (config_name, challenge_name, step_num, tool_name, is_error)
+        )
+        if len(self.recent_steps) > 10:
+            self.recent_steps.pop(0)
+
+        # In verbose mode, print immediately
+        if self.verbose:
+            color = self.get_config_color(config_name)
+            status = "[red]ERR[/red]" if is_error else "[green]OK[/green]"
+            console.print(
+                f"[{color}][{config_name}][/{color}] {challenge_name} "
+                f"step {step_num}: {tool_name} {status}"
+            )
+
     def update(self, progress: ExecutionProgress) -> None:
         """Update UI with execution progress."""
         if progress.status == "starting":
             self.active_runs[progress.config_name] = progress.challenge_name
+            self.active_steps[progress.config_name] = "starting..."
         elif progress.status in ("completed", "failed"):
             if progress.config_name in self.active_runs:
                 del self.active_runs[progress.config_name]
+            if progress.config_name in self.active_steps:
+                del self.active_steps[progress.config_name]
             if progress.result:
                 self.completed.append(progress.result)
                 self.results_by_config[progress.config_name].append(progress.result)
@@ -84,17 +138,20 @@
     def render_active_runs(self) -> Panel:
         """Render panel showing active runs."""
         if not self.active_runs:
-            content = Text("No active runs", style="dim")
+            content = Text("Waiting for runs to start...", style="dim")
         else:
             lines = []
             for config_name, challenge_name in self.active_runs.items():
+                color = self.get_config_color(config_name)
+                step_info = self.active_steps.get(config_name, "")
                 lines.append(
                     Text.assemble(
                         (" ", ""),
                         ("\u25cf ", "yellow"),  # Bullet point
-                        (f"{config_name}", "cyan"),
+                        (f"{config_name}", color),
                         (" \u2192 ", "dim"),  # Arrow
                         (challenge_name, "white"),
+                        (f" ({step_info})", "dim") if step_info else ("", ""),
                     )
                 )
             content = Group(*lines)
@@ -164,6 +221,44 @@
             border_style="green" if self.completed else "dim",
         )

+    def render_recent_steps(self) -> Panel:
+        """Render panel showing recent step executions."""
+        if not self.recent_steps:
+            content = Text("No steps yet", style="dim")
+        else:
+            lines = []
+            for (
+                config_name,
+                challenge,
+                step_num,
+                tool_name,
+                is_error,
+            ) in self.recent_steps[-5:]:
+                color = self.get_config_color(config_name)
+                status = (
+                    Text("\u2717", style="red")
+                    if is_error
+                    else Text("\u2713", style="green")
+                )
+                lines.append(
+                    Text.assemble(
+                        (" ", ""),
+                        status,
+                        (" ", ""),
+                        (f"[{config_name}]", color),
+                        (" ", ""),
+                        (f"{challenge} #{step_num}: ", "dim"),
+                        (tool_name, "white"),
+                    )
+                )
+            content = Group(*lines)
+
+        return Panel(
+            content,
+            title="[bold]Recent Steps[/bold]",
+            border_style="dim",
+        )
+
     def render_live_display(self) -> Group:
         """Render the full live display."""
         return Group(
@@ -171,9 +266,15 @@
             "",
             self.render_active_runs(),
             "",
+            self.render_recent_steps(),
+            "",
             self.render_recent_completions(),
         )

+    def __rich_console__(self, console: Console, options) -> RenderableType:
+        """Support for Rich Live display - called on each refresh."""
+        yield self.render_live_display()
+
     def print_final_summary(self) -> None:
         """Print final summary after all benchmarks complete."""
         elapsed = (