fix(classic): resolve linting and formatting issues for CI compliance

- Update .flake8 config to exclude workspace directories and ignore E203
- Fix import sorting (isort) across multiple files
- Fix code formatting (black) across multiple files
- Remove unused imports and fix line length issues (flake8)
- Fix f-strings without placeholders and unused variables

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Nicholas Tindle
2026-01-20 01:16:38 -06:00
parent 6f2783468c
commit 60fdee1345
12 changed files with 111 additions and 79 deletions

View File

@@ -1,12 +1,15 @@
[flake8]
max-line-length = 88
extend-ignore = E203
exclude =
.tox,
__pycache__,
*.pyc,
.env
venv*/*,
.venv/*,
reports/*,
dist/*,
data/*,
.env,
venv*,
.venv,
reports,
dist,
data,
.benchmark_workspaces,
.autogpt,

View File

@@ -27,8 +27,6 @@ Usage:
import argparse
import json
import os
import re
import sys
from collections import Counter, defaultdict
from dataclasses import dataclass, field
@@ -52,10 +50,10 @@ Tree: Any = None
try:
from rich.console import Console
from rich.markdown import Markdown
from rich.markdown import Markdown # noqa: F401
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.prompt import Confirm, Prompt
from rich.prompt import Confirm, Prompt # noqa: F401
from rich.table import Table
from rich.text import Text
from rich.tree import Tree
@@ -277,7 +275,9 @@ class FailureAnalyzer:
}
planning_count = sum(test.tool_distribution.get(t, 0) for t in planning_tools)
execution_count = sum(test.tool_distribution.get(t, 0) for t in execution_tools)
_execution_count = sum( # noqa: F841
test.tool_distribution.get(t, 0) for t in execution_tools
)
if test.n_steps > 0:
planning_ratio = planning_count / test.n_steps
@@ -402,17 +402,22 @@ class FailureAnalyzer:
self.console.print(table)
else:
print("\n=== Strategy Comparison Summary ===")
print(
f"{'Strategy':<20} {'Tests':>6} {'Passed':>7} {'Failed':>7} {'Success%':>10} {'AvgSteps':>9} {'Cost':>10}"
hdr = (
f"{'Strategy':<20} {'Tests':>6} {'Passed':>7} "
f"{'Failed':>7} {'Success%':>10} {'AvgSteps':>9} {'Cost':>10}"
)
print(hdr)
print("-" * 80)
for name, analysis in sorted(
self.strategies.items(), key=lambda x: x[1].success_rate, reverse=True
):
print(
f"{name:<20} {analysis.total_tests:>6} {analysis.passed:>7} {analysis.failed:>7} "
f"{analysis.success_rate:>9.1f}% {analysis.avg_steps:>9.1f} ${analysis.total_cost:>9.4f}"
row = (
f"{name:<20} {analysis.total_tests:>6} "
f"{analysis.passed:>7} {analysis.failed:>7} "
f"{analysis.success_rate:>9.1f}% {analysis.avg_steps:>9.1f} "
f"${analysis.total_cost:>9.4f}"
)
print(row)
def print_pattern_analysis(self) -> None:
"""Print failure pattern analysis."""
@@ -430,12 +435,12 @@ class FailureAnalyzer:
table.add_column("Description")
pattern_descriptions = {
FailurePattern.OVER_PLANNING: "Agent spends too much time planning without executing",
FailurePattern.TOOL_LOOP: "Agent repeats same tool 3+ times consecutively",
FailurePattern.MISSING_CRITICAL: "Agent never performed key action (e.g., write_file)",
FailurePattern.TIMEOUT: "Agent hit step limit before completing task",
FailurePattern.ERROR_UNRECOVERED: "Agent hit errors and couldn't recover",
FailurePattern.WRONG_APPROACH: "Agent took fundamentally wrong approach",
FailurePattern.OVER_PLANNING: "Too much planning, not enough action",
FailurePattern.TOOL_LOOP: "Repeats same tool 3+ times consecutively",
FailurePattern.MISSING_CRITICAL: "Never performed key action",
FailurePattern.TIMEOUT: "Hit step limit before completing task",
FailurePattern.ERROR_UNRECOVERED: "Hit errors and couldn't recover",
FailurePattern.WRONG_APPROACH: "Took fundamentally wrong approach",
FailurePattern.UNKNOWN: "Pattern not categorized",
}
@@ -457,15 +462,13 @@ class FailureAnalyzer:
)
for analysis in strategies_to_show:
self._print(f"\n")
self._print("\n")
if RICH_AVAILABLE:
self.console.print(
Panel(
f"[bold]{analysis.strategy_name}[/bold] - "
f"{analysis.failed} failures out of {analysis.total_tests} tests",
title="Strategy Analysis",
)
msg = (
f"[bold]{analysis.strategy_name}[/bold] - "
f"{analysis.failed} failures out of {analysis.total_tests} tests"
)
self.console.print(Panel(msg, title="Strategy Analysis"))
else:
print(f"\n=== {analysis.strategy_name} ===")
print(f"Failures: {analysis.failed}/{analysis.total_tests}")
@@ -480,9 +483,8 @@ class FailureAnalyzer:
tree.add(f"[dim]Task:[/dim] {test.task[:80]}...")
tree.add(f"[dim]Steps:[/dim] {test.n_steps}")
tree.add(f"[dim]Cost:[/dim] ${test.total_cost:.4f}")
tree.add(
f"[dim]Patterns:[/dim] {', '.join(p.value for p in test.patterns_detected)}"
)
patterns = ", ".join(p.value for p in test.patterns_detected)
tree.add(f"[dim]Patterns:[/dim] {patterns}")
tools = tree.add("[dim]Tool sequence:[/dim]")
tool_seq = [s.tool_name for s in test.steps[:10]]
@@ -514,14 +516,14 @@ class FailureAnalyzer:
return
results = self.test_comparison[test_name]
self._print(f"\n")
self._print("\n")
if RICH_AVAILABLE:
self.console.print(Panel(f"[bold]Comparing: {test_name}[/bold]"))
else:
print(f"\n=== Comparing: {test_name} ===")
for strategy, test in sorted(results.items()):
self._print(f"\n")
self._print("\n")
if RICH_AVAILABLE:
self.console.print(f"[cyan]--- {strategy} ---[/cyan]")
else:
@@ -666,9 +668,8 @@ class FailureAnalyzer:
self.console.print(f" [dim]{key}:[/dim] {value}")
if step.tool_result:
self.console.print(
f"\n[cyan]Result:[/cyan] {json.dumps(step.tool_result, indent=2)[:500]}"
)
result_str = json.dumps(step.tool_result, indent=2)[:500]
self.console.print(f"\n[cyan]Result:[/cyan] {result_str}")
self.console.print(
f"\n[cyan]Cumulative Cost:[/cyan] ${step.cumulative_cost:.4f}"
@@ -702,10 +703,12 @@ class FailureAnalyzer:
for name, analysis in sorted(
self.strategies.items(), key=lambda x: x[1].success_rate, reverse=True
):
lines.append(
f"| {name} | {analysis.total_tests} | {analysis.passed} | {analysis.failed} | "
f"{analysis.success_rate:.1f}% | {analysis.avg_steps:.1f} | ${analysis.total_cost:.4f} |"
row = (
f"| {name} | {analysis.total_tests} | {analysis.passed} "
f"| {analysis.failed} | {analysis.success_rate:.1f}% "
f"| {analysis.avg_steps:.1f} | ${analysis.total_cost:.4f} |"
)
lines.append(row)
# Pattern analysis
lines.append("\n## Failure Patterns\n")
@@ -728,12 +731,10 @@ class FailureAnalyzer:
lines.append(f"#### {test.test_name}\n")
lines.append(f"- **Task**: {test.task[:100]}...")
lines.append(f"- **Steps**: {test.n_steps}")
lines.append(
f"- **Patterns**: {', '.join(p.value for p in test.patterns_detected)}"
)
lines.append(
f"- **Tool sequence**: {' -> '.join(s.tool_name for s in test.steps[:8])}"
)
patterns = ", ".join(p.value for p in test.patterns_detected)
lines.append(f"- **Patterns**: {patterns}")
tools = " -> ".join(s.tool_name for s in test.steps[:8])
lines.append(f"- **Tool sequence**: {tools}")
if test.fail_reason:
lines.append(f"- **Fail reason**: {test.fail_reason[:150]}...")
lines.append("")

View File

@@ -47,7 +47,7 @@ for report_file in sorted(report_files):
data = json.load(f)
if "tests" in data:
test_tree = data["tests"]
# Handle both old format (agent_git_commit_sha) and new format (config_name in folder)
# Handle old format (agent_git_commit_sha) and new (config_name)
if "config" in data and "config_name" in data["config"]:
label = data["config"]["config_name"]
elif "agent_git_commit_sha" in data and "/" in data["agent_git_commit_sha"]:
@@ -117,7 +117,7 @@ for report_file in sorted(report_files):
if test_name not in test_names:
test_names.append(test_name)
# Handle both old format (success: bool) and new format (success_percentage: float)
# Handle old format (success: bool) and new (success_percentage)
if "success" in test_metrics:
success_value = test_metrics["success"]
elif "success_percentage" in test_metrics:

View File

@@ -14,7 +14,6 @@ from .models import (
STRATEGIES,
BenchmarkConfig,
HarnessConfig,
ModelConfig,
)
from .ui import console
@@ -166,7 +165,7 @@ def cli():
"--ci",
"ci_mode",
is_flag=True,
help="CI mode: no live display, but shows completion blocks. Auto-enabled when CI env var is set.",
help="CI mode: no live display. Auto-enabled when CI env var is set.",
)
@click.option(
"--fresh",
@@ -546,9 +545,8 @@ def state_reset(
from .state import StateManager
if not strategies and not models and not challenges:
console.print(
"[red]Must specify at least one of --strategy, --model, or --challenge[/red]"
)
msg = "[red]Must specify --strategy, --model, or --challenge[/red]"
console.print(msg)
sys.exit(1)
if reports_dir is None:

View File

@@ -162,7 +162,7 @@ def find_challenges_dir() -> Optional[Path]:
Looks for common relative paths from the current working directory
and the package location.
"""
# First check relative to this file's location (preferred - challenges are in direct_benchmark/)
# First check relative to this file's location (preferred)
pkg_dir = Path(__file__).parent.parent
local_challenges = pkg_dir / "challenges"
if local_challenges.exists() and (local_challenges / "abilities").exists():

View File

@@ -5,7 +5,6 @@ import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Optional
from .models import Challenge, ChallengeResult

View File

@@ -3,8 +3,7 @@
import asyncio
import re
from datetime import datetime
from pathlib import Path
from typing import Optional, Union
from typing import Union
from rich.live import Live
@@ -104,10 +103,11 @@ class BenchmarkHarness:
):
prev_completed = self.state_manager.get_completed_count()
if prev_completed > 0:
console.print(
f"[yellow]Warning: Config changed from previous run "
f"({prev_completed} completed). Use --fresh to start over.[/yellow]"
msg = (
f"[yellow]Warning: Config changed ({prev_completed} "
f"completed). Use --fresh to start over.[/yellow]"
)
console.print(msg)
self.state_manager.reset()
# Save current config for future mismatch detection

View File

@@ -36,7 +36,13 @@ class ReportGenerator:
tests = {}
total_cost = 0.0
highest_difficulty = "interface"
difficulty_order = ["interface", "basic", "intermediate", "advanced", "hard"]
_difficulty_order = [ # noqa: F841
"interface",
"basic",
"intermediate",
"advanced",
"hard",
]
for result in results:
total_cost += result.cost
@@ -63,6 +69,17 @@ class ReportGenerator:
"reached_cutoff": result.timed_out,
"n_steps": result.n_steps,
"cost": result.cost,
"steps": [
{
"step_num": step.step_num,
"tool_name": step.tool_name,
"tool_args": step.tool_args,
"result": step.result,
"is_error": step.is_error,
"cost": step.cumulative_cost,
}
for step in result.steps
],
}
],
}
@@ -137,6 +154,22 @@ class ReportGenerator:
"success": r.success,
"n_steps": r.n_steps,
"cost": r.cost,
"error": r.error_message,
"timed_out": r.timed_out,
"steps": [
{
"step": s.step_num,
"tool": s.tool_name,
"args": s.tool_args,
"result": (
s.result[:500] + "..."
if len(s.result) > 500
else s.result
),
"error": s.is_error,
}
for s in r.steps
],
}
for r in results
},

View File

@@ -9,7 +9,7 @@ from typing import Callable, Optional
from autogpt.agent_factory.configurators import create_agent
from autogpt.agents.agent import Agent
from autogpt.app.config import AppConfig, ConfigBuilder
from autogpt.app.config import ConfigBuilder
from forge.file_storage import FileStorageBackendName, get_storage
from forge.llm.providers import MultiProvider
@@ -46,7 +46,7 @@ class AgentRunner:
start_time = datetime.now()
steps: list[StepResult] = []
# Create isolated workspace for this run (include attempt in name for uniqueness)
# Create isolated workspace for this run
prefix = f"{challenge.name}_{self.config.strategy}_"
if attempt > 1:
prefix = f"{challenge.name}_{self.config.strategy}_attempt{attempt}_"

View File

@@ -7,7 +7,6 @@ from typing import Optional
from rich.columns import Columns
from rich.console import Console, Group, RenderableType
from rich.live import Live
from rich.panel import Panel
from rich.progress import (
BarColumn,
@@ -236,9 +235,8 @@ class BenchmarkUI:
f"[{status_style} bold][{status}][/{status_style} bold] "
f"[{color}]{config_name}[/{color}] - {challenge_display}"
)
console.print(
f"[dim]Run ID: {config_name}:{challenge_name}:{result.attempt} @ {timestamp}[/dim]"
)
run_id = f"{config_name}:{challenge_name}:{result.attempt}"
console.print(f"[dim]Run ID: {run_id} @ {timestamp}[/dim]")
console.print(f"[{status_style}]{'' * 70}[/{status_style}]")
# Print steps
@@ -252,9 +250,11 @@ class BenchmarkUI:
# Print summary
console.print()
console.print(
f" [dim]Steps: {result.n_steps} | Time: {result.run_time_seconds:.1f}s | Cost: ${result.cost:.4f}[/dim]"
stats = (
f"Steps: {result.n_steps} | Time: {result.run_time_seconds:.1f}s "
f"| Cost: ${result.cost:.4f}"
)
console.print(f" [dim]{stats}[/dim]")
# Print error if any (skip generic timeout message since status shows it)
if result.error_message and result.error_message != "Challenge timed out":
@@ -321,11 +321,9 @@ class BenchmarkUI:
rows.append(Columns(panels[i : i + max_cols], equal=True, expand=True))
content = Group(*rows)
return Panel(
content,
title=f"[bold]Active Runs ({len(self.active_runs)}/{self.max_parallel})[/bold]",
border_style="blue",
)
active = len(self.active_runs)
title = f"[bold]Active Runs ({active}/{self.max_parallel})[/bold]"
return Panel(content, title=title, border_style="blue")
def render_summary_table(self) -> Table:
"""Render summary table of results by configuration."""
@@ -440,7 +438,9 @@ class BenchmarkUI:
total_would_pass = sum(
1 for r in self.completed if r.timed_out and r.score >= 0.9
)
total_failed = len(self.completed) - total_passed - total_would_pass
_total_failed = ( # noqa: F841
len(self.completed) - total_passed - total_would_pass
)
total_cost = sum(r.cost for r in self.completed)
# Include "would pass" in the effective rate
effective_passed = total_passed + total_would_pass
@@ -507,8 +507,6 @@ class JsonUI:
self.results_by_config[progress.config_name].append(progress.result)
def print_final_summary(self) -> None:
import json
output = {
"results": {
config: {

View File

@@ -1,6 +1,6 @@
"""Tests for the web fetch component."""
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock
import httpx
import pytest

View File

@@ -7,7 +7,7 @@ Uses trafilatura for intelligent content extraction.
import logging
from typing import Iterator, Literal, Optional
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin
import httpx
import trafilatura