mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
fix(classic): resolve linting and formatting issues for CI compliance
- Update .flake8 config to exclude workspace directories and ignore E203
- Fix import sorting (isort) across multiple files
- Fix code formatting (black) across multiple files
- Remove unused imports and fix line length issues (flake8)
- Fix f-strings without placeholders and unused variables

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,12 +1,15 @@
|
||||
[flake8]
|
||||
max-line-length = 88
|
||||
extend-ignore = E203
|
||||
exclude =
|
||||
.tox,
|
||||
__pycache__,
|
||||
*.pyc,
|
||||
.env
|
||||
venv*/*,
|
||||
.venv/*,
|
||||
reports/*,
|
||||
dist/*,
|
||||
data/*,
|
||||
.env,
|
||||
venv*,
|
||||
.venv,
|
||||
reports,
|
||||
dist,
|
||||
data,
|
||||
.benchmark_workspaces,
|
||||
.autogpt,
|
||||
|
||||
@@ -27,8 +27,6 @@ Usage:
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
@@ -52,10 +50,10 @@ Tree: Any = None
|
||||
|
||||
try:
|
||||
from rich.console import Console
|
||||
from rich.markdown import Markdown
|
||||
from rich.markdown import Markdown # noqa: F401
|
||||
from rich.panel import Panel
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn
|
||||
from rich.prompt import Confirm, Prompt
|
||||
from rich.prompt import Confirm, Prompt # noqa: F401
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
from rich.tree import Tree
|
||||
@@ -277,7 +275,9 @@ class FailureAnalyzer:
|
||||
}
|
||||
|
||||
planning_count = sum(test.tool_distribution.get(t, 0) for t in planning_tools)
|
||||
execution_count = sum(test.tool_distribution.get(t, 0) for t in execution_tools)
|
||||
_execution_count = sum( # noqa: F841
|
||||
test.tool_distribution.get(t, 0) for t in execution_tools
|
||||
)
|
||||
|
||||
if test.n_steps > 0:
|
||||
planning_ratio = planning_count / test.n_steps
|
||||
@@ -402,17 +402,22 @@ class FailureAnalyzer:
|
||||
self.console.print(table)
|
||||
else:
|
||||
print("\n=== Strategy Comparison Summary ===")
|
||||
print(
|
||||
f"{'Strategy':<20} {'Tests':>6} {'Passed':>7} {'Failed':>7} {'Success%':>10} {'AvgSteps':>9} {'Cost':>10}"
|
||||
hdr = (
|
||||
f"{'Strategy':<20} {'Tests':>6} {'Passed':>7} "
|
||||
f"{'Failed':>7} {'Success%':>10} {'AvgSteps':>9} {'Cost':>10}"
|
||||
)
|
||||
print(hdr)
|
||||
print("-" * 80)
|
||||
for name, analysis in sorted(
|
||||
self.strategies.items(), key=lambda x: x[1].success_rate, reverse=True
|
||||
):
|
||||
print(
|
||||
f"{name:<20} {analysis.total_tests:>6} {analysis.passed:>7} {analysis.failed:>7} "
|
||||
f"{analysis.success_rate:>9.1f}% {analysis.avg_steps:>9.1f} ${analysis.total_cost:>9.4f}"
|
||||
row = (
|
||||
f"{name:<20} {analysis.total_tests:>6} "
|
||||
f"{analysis.passed:>7} {analysis.failed:>7} "
|
||||
f"{analysis.success_rate:>9.1f}% {analysis.avg_steps:>9.1f} "
|
||||
f"${analysis.total_cost:>9.4f}"
|
||||
)
|
||||
print(row)
|
||||
|
||||
def print_pattern_analysis(self) -> None:
|
||||
"""Print failure pattern analysis."""
|
||||
@@ -430,12 +435,12 @@ class FailureAnalyzer:
|
||||
table.add_column("Description")
|
||||
|
||||
pattern_descriptions = {
|
||||
FailurePattern.OVER_PLANNING: "Agent spends too much time planning without executing",
|
||||
FailurePattern.TOOL_LOOP: "Agent repeats same tool 3+ times consecutively",
|
||||
FailurePattern.MISSING_CRITICAL: "Agent never performed key action (e.g., write_file)",
|
||||
FailurePattern.TIMEOUT: "Agent hit step limit before completing task",
|
||||
FailurePattern.ERROR_UNRECOVERED: "Agent hit errors and couldn't recover",
|
||||
FailurePattern.WRONG_APPROACH: "Agent took fundamentally wrong approach",
|
||||
FailurePattern.OVER_PLANNING: "Too much planning, not enough action",
|
||||
FailurePattern.TOOL_LOOP: "Repeats same tool 3+ times consecutively",
|
||||
FailurePattern.MISSING_CRITICAL: "Never performed key action",
|
||||
FailurePattern.TIMEOUT: "Hit step limit before completing task",
|
||||
FailurePattern.ERROR_UNRECOVERED: "Hit errors and couldn't recover",
|
||||
FailurePattern.WRONG_APPROACH: "Took fundamentally wrong approach",
|
||||
FailurePattern.UNKNOWN: "Pattern not categorized",
|
||||
}
|
||||
|
||||
@@ -457,15 +462,13 @@ class FailureAnalyzer:
|
||||
)
|
||||
|
||||
for analysis in strategies_to_show:
|
||||
self._print(f"\n")
|
||||
self._print("\n")
|
||||
if RICH_AVAILABLE:
|
||||
self.console.print(
|
||||
Panel(
|
||||
f"[bold]{analysis.strategy_name}[/bold] - "
|
||||
f"{analysis.failed} failures out of {analysis.total_tests} tests",
|
||||
title="Strategy Analysis",
|
||||
)
|
||||
msg = (
|
||||
f"[bold]{analysis.strategy_name}[/bold] - "
|
||||
f"{analysis.failed} failures out of {analysis.total_tests} tests"
|
||||
)
|
||||
self.console.print(Panel(msg, title="Strategy Analysis"))
|
||||
else:
|
||||
print(f"\n=== {analysis.strategy_name} ===")
|
||||
print(f"Failures: {analysis.failed}/{analysis.total_tests}")
|
||||
@@ -480,9 +483,8 @@ class FailureAnalyzer:
|
||||
tree.add(f"[dim]Task:[/dim] {test.task[:80]}...")
|
||||
tree.add(f"[dim]Steps:[/dim] {test.n_steps}")
|
||||
tree.add(f"[dim]Cost:[/dim] ${test.total_cost:.4f}")
|
||||
tree.add(
|
||||
f"[dim]Patterns:[/dim] {', '.join(p.value for p in test.patterns_detected)}"
|
||||
)
|
||||
patterns = ", ".join(p.value for p in test.patterns_detected)
|
||||
tree.add(f"[dim]Patterns:[/dim] {patterns}")
|
||||
|
||||
tools = tree.add("[dim]Tool sequence:[/dim]")
|
||||
tool_seq = [s.tool_name for s in test.steps[:10]]
|
||||
@@ -514,14 +516,14 @@ class FailureAnalyzer:
|
||||
return
|
||||
|
||||
results = self.test_comparison[test_name]
|
||||
self._print(f"\n")
|
||||
self._print("\n")
|
||||
if RICH_AVAILABLE:
|
||||
self.console.print(Panel(f"[bold]Comparing: {test_name}[/bold]"))
|
||||
else:
|
||||
print(f"\n=== Comparing: {test_name} ===")
|
||||
|
||||
for strategy, test in sorted(results.items()):
|
||||
self._print(f"\n")
|
||||
self._print("\n")
|
||||
if RICH_AVAILABLE:
|
||||
self.console.print(f"[cyan]--- {strategy} ---[/cyan]")
|
||||
else:
|
||||
@@ -666,9 +668,8 @@ class FailureAnalyzer:
|
||||
self.console.print(f" [dim]{key}:[/dim] {value}")
|
||||
|
||||
if step.tool_result:
|
||||
self.console.print(
|
||||
f"\n[cyan]Result:[/cyan] {json.dumps(step.tool_result, indent=2)[:500]}"
|
||||
)
|
||||
result_str = json.dumps(step.tool_result, indent=2)[:500]
|
||||
self.console.print(f"\n[cyan]Result:[/cyan] {result_str}")
|
||||
|
||||
self.console.print(
|
||||
f"\n[cyan]Cumulative Cost:[/cyan] ${step.cumulative_cost:.4f}"
|
||||
@@ -702,10 +703,12 @@ class FailureAnalyzer:
|
||||
for name, analysis in sorted(
|
||||
self.strategies.items(), key=lambda x: x[1].success_rate, reverse=True
|
||||
):
|
||||
lines.append(
|
||||
f"| {name} | {analysis.total_tests} | {analysis.passed} | {analysis.failed} | "
|
||||
f"{analysis.success_rate:.1f}% | {analysis.avg_steps:.1f} | ${analysis.total_cost:.4f} |"
|
||||
row = (
|
||||
f"| {name} | {analysis.total_tests} | {analysis.passed} "
|
||||
f"| {analysis.failed} | {analysis.success_rate:.1f}% "
|
||||
f"| {analysis.avg_steps:.1f} | ${analysis.total_cost:.4f} |"
|
||||
)
|
||||
lines.append(row)
|
||||
|
||||
# Pattern analysis
|
||||
lines.append("\n## Failure Patterns\n")
|
||||
@@ -728,12 +731,10 @@ class FailureAnalyzer:
|
||||
lines.append(f"#### {test.test_name}\n")
|
||||
lines.append(f"- **Task**: {test.task[:100]}...")
|
||||
lines.append(f"- **Steps**: {test.n_steps}")
|
||||
lines.append(
|
||||
f"- **Patterns**: {', '.join(p.value for p in test.patterns_detected)}"
|
||||
)
|
||||
lines.append(
|
||||
f"- **Tool sequence**: {' -> '.join(s.tool_name for s in test.steps[:8])}"
|
||||
)
|
||||
patterns = ", ".join(p.value for p in test.patterns_detected)
|
||||
lines.append(f"- **Patterns**: {patterns}")
|
||||
tools = " -> ".join(s.tool_name for s in test.steps[:8])
|
||||
lines.append(f"- **Tool sequence**: {tools}")
|
||||
if test.fail_reason:
|
||||
lines.append(f"- **Fail reason**: {test.fail_reason[:150]}...")
|
||||
lines.append("")
|
||||
|
||||
@@ -47,7 +47,7 @@ for report_file in sorted(report_files):
|
||||
data = json.load(f)
|
||||
if "tests" in data:
|
||||
test_tree = data["tests"]
|
||||
# Handle both old format (agent_git_commit_sha) and new format (config_name in folder)
|
||||
# Handle old format (agent_git_commit_sha) and new (config_name)
|
||||
if "config" in data and "config_name" in data["config"]:
|
||||
label = data["config"]["config_name"]
|
||||
elif "agent_git_commit_sha" in data and "/" in data["agent_git_commit_sha"]:
|
||||
@@ -117,7 +117,7 @@ for report_file in sorted(report_files):
|
||||
if test_name not in test_names:
|
||||
test_names.append(test_name)
|
||||
|
||||
# Handle both old format (success: bool) and new format (success_percentage: float)
|
||||
# Handle old format (success: bool) and new (success_percentage)
|
||||
if "success" in test_metrics:
|
||||
success_value = test_metrics["success"]
|
||||
elif "success_percentage" in test_metrics:
|
||||
|
||||
@@ -14,7 +14,6 @@ from .models import (
|
||||
STRATEGIES,
|
||||
BenchmarkConfig,
|
||||
HarnessConfig,
|
||||
ModelConfig,
|
||||
)
|
||||
from .ui import console
|
||||
|
||||
@@ -166,7 +165,7 @@ def cli():
|
||||
"--ci",
|
||||
"ci_mode",
|
||||
is_flag=True,
|
||||
help="CI mode: no live display, but shows completion blocks. Auto-enabled when CI env var is set.",
|
||||
help="CI mode: no live display. Auto-enabled when CI env var is set.",
|
||||
)
|
||||
@click.option(
|
||||
"--fresh",
|
||||
@@ -546,9 +545,8 @@ def state_reset(
|
||||
from .state import StateManager
|
||||
|
||||
if not strategies and not models and not challenges:
|
||||
console.print(
|
||||
"[red]Must specify at least one of --strategy, --model, or --challenge[/red]"
|
||||
)
|
||||
msg = "[red]Must specify --strategy, --model, or --challenge[/red]"
|
||||
console.print(msg)
|
||||
sys.exit(1)
|
||||
|
||||
if reports_dir is None:
|
||||
|
||||
@@ -162,7 +162,7 @@ def find_challenges_dir() -> Optional[Path]:
|
||||
Looks for common relative paths from the current working directory
|
||||
and the package location.
|
||||
"""
|
||||
# First check relative to this file's location (preferred - challenges are in direct_benchmark/)
|
||||
# First check relative to this file's location (preferred)
|
||||
pkg_dir = Path(__file__).parent.parent
|
||||
local_challenges = pkg_dir / "challenges"
|
||||
if local_challenges.exists() and (local_challenges / "abilities").exists():
|
||||
|
||||
@@ -5,7 +5,6 @@ import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from .models import Challenge, ChallengeResult
|
||||
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
import asyncio
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
from typing import Union
|
||||
|
||||
from rich.live import Live
|
||||
|
||||
@@ -104,10 +103,11 @@ class BenchmarkHarness:
|
||||
):
|
||||
prev_completed = self.state_manager.get_completed_count()
|
||||
if prev_completed > 0:
|
||||
console.print(
|
||||
f"[yellow]Warning: Config changed from previous run "
|
||||
f"({prev_completed} completed). Use --fresh to start over.[/yellow]"
|
||||
msg = (
|
||||
f"[yellow]Warning: Config changed ({prev_completed} "
|
||||
f"completed). Use --fresh to start over.[/yellow]"
|
||||
)
|
||||
console.print(msg)
|
||||
self.state_manager.reset()
|
||||
|
||||
# Save current config for future mismatch detection
|
||||
|
||||
@@ -36,7 +36,13 @@ class ReportGenerator:
|
||||
tests = {}
|
||||
total_cost = 0.0
|
||||
highest_difficulty = "interface"
|
||||
difficulty_order = ["interface", "basic", "intermediate", "advanced", "hard"]
|
||||
_difficulty_order = [ # noqa: F841
|
||||
"interface",
|
||||
"basic",
|
||||
"intermediate",
|
||||
"advanced",
|
||||
"hard",
|
||||
]
|
||||
|
||||
for result in results:
|
||||
total_cost += result.cost
|
||||
@@ -63,6 +69,17 @@ class ReportGenerator:
|
||||
"reached_cutoff": result.timed_out,
|
||||
"n_steps": result.n_steps,
|
||||
"cost": result.cost,
|
||||
"steps": [
|
||||
{
|
||||
"step_num": step.step_num,
|
||||
"tool_name": step.tool_name,
|
||||
"tool_args": step.tool_args,
|
||||
"result": step.result,
|
||||
"is_error": step.is_error,
|
||||
"cost": step.cumulative_cost,
|
||||
}
|
||||
for step in result.steps
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
@@ -137,6 +154,22 @@ class ReportGenerator:
|
||||
"success": r.success,
|
||||
"n_steps": r.n_steps,
|
||||
"cost": r.cost,
|
||||
"error": r.error_message,
|
||||
"timed_out": r.timed_out,
|
||||
"steps": [
|
||||
{
|
||||
"step": s.step_num,
|
||||
"tool": s.tool_name,
|
||||
"args": s.tool_args,
|
||||
"result": (
|
||||
s.result[:500] + "..."
|
||||
if len(s.result) > 500
|
||||
else s.result
|
||||
),
|
||||
"error": s.is_error,
|
||||
}
|
||||
for s in r.steps
|
||||
],
|
||||
}
|
||||
for r in results
|
||||
},
|
||||
|
||||
@@ -9,7 +9,7 @@ from typing import Callable, Optional
|
||||
|
||||
from autogpt.agent_factory.configurators import create_agent
|
||||
from autogpt.agents.agent import Agent
|
||||
from autogpt.app.config import AppConfig, ConfigBuilder
|
||||
from autogpt.app.config import ConfigBuilder
|
||||
|
||||
from forge.file_storage import FileStorageBackendName, get_storage
|
||||
from forge.llm.providers import MultiProvider
|
||||
@@ -46,7 +46,7 @@ class AgentRunner:
|
||||
start_time = datetime.now()
|
||||
steps: list[StepResult] = []
|
||||
|
||||
# Create isolated workspace for this run (include attempt in name for uniqueness)
|
||||
# Create isolated workspace for this run
|
||||
prefix = f"{challenge.name}_{self.config.strategy}_"
|
||||
if attempt > 1:
|
||||
prefix = f"{challenge.name}_{self.config.strategy}_attempt{attempt}_"
|
||||
|
||||
@@ -7,7 +7,6 @@ from typing import Optional
|
||||
|
||||
from rich.columns import Columns
|
||||
from rich.console import Console, Group, RenderableType
|
||||
from rich.live import Live
|
||||
from rich.panel import Panel
|
||||
from rich.progress import (
|
||||
BarColumn,
|
||||
@@ -236,9 +235,8 @@ class BenchmarkUI:
|
||||
f"[{status_style} bold][{status}][/{status_style} bold] "
|
||||
f"[{color}]{config_name}[/{color}] - {challenge_display}"
|
||||
)
|
||||
console.print(
|
||||
f"[dim]Run ID: {config_name}:{challenge_name}:{result.attempt} @ {timestamp}[/dim]"
|
||||
)
|
||||
run_id = f"{config_name}:{challenge_name}:{result.attempt}"
|
||||
console.print(f"[dim]Run ID: {run_id} @ {timestamp}[/dim]")
|
||||
console.print(f"[{status_style}]{'═' * 70}[/{status_style}]")
|
||||
|
||||
# Print steps
|
||||
@@ -252,9 +250,11 @@ class BenchmarkUI:
|
||||
|
||||
# Print summary
|
||||
console.print()
|
||||
console.print(
|
||||
f" [dim]Steps: {result.n_steps} | Time: {result.run_time_seconds:.1f}s | Cost: ${result.cost:.4f}[/dim]"
|
||||
stats = (
|
||||
f"Steps: {result.n_steps} | Time: {result.run_time_seconds:.1f}s "
|
||||
f"| Cost: ${result.cost:.4f}"
|
||||
)
|
||||
console.print(f" [dim]{stats}[/dim]")
|
||||
|
||||
# Print error if any (skip generic timeout message since status shows it)
|
||||
if result.error_message and result.error_message != "Challenge timed out":
|
||||
@@ -321,11 +321,9 @@ class BenchmarkUI:
|
||||
rows.append(Columns(panels[i : i + max_cols], equal=True, expand=True))
|
||||
content = Group(*rows)
|
||||
|
||||
return Panel(
|
||||
content,
|
||||
title=f"[bold]Active Runs ({len(self.active_runs)}/{self.max_parallel})[/bold]",
|
||||
border_style="blue",
|
||||
)
|
||||
active = len(self.active_runs)
|
||||
title = f"[bold]Active Runs ({active}/{self.max_parallel})[/bold]"
|
||||
return Panel(content, title=title, border_style="blue")
|
||||
|
||||
def render_summary_table(self) -> Table:
|
||||
"""Render summary table of results by configuration."""
|
||||
@@ -440,7 +438,9 @@ class BenchmarkUI:
|
||||
total_would_pass = sum(
|
||||
1 for r in self.completed if r.timed_out and r.score >= 0.9
|
||||
)
|
||||
total_failed = len(self.completed) - total_passed - total_would_pass
|
||||
_total_failed = ( # noqa: F841
|
||||
len(self.completed) - total_passed - total_would_pass
|
||||
)
|
||||
total_cost = sum(r.cost for r in self.completed)
|
||||
# Include "would pass" in the effective rate
|
||||
effective_passed = total_passed + total_would_pass
|
||||
@@ -507,8 +507,6 @@ class JsonUI:
|
||||
self.results_by_config[progress.config_name].append(progress.result)
|
||||
|
||||
def print_final_summary(self) -> None:
|
||||
import json
|
||||
|
||||
output = {
|
||||
"results": {
|
||||
config: {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Tests for the web fetch component."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
@@ -7,7 +7,7 @@ Uses trafilatura for intelligent content extraction.
|
||||
|
||||
import logging
|
||||
from typing import Iterator, Literal, Optional
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
import trafilatura
|
||||
|
||||
Reference in New Issue
Block a user