mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
feat(classic): add external benchmark adapters for GAIA, SWE-bench, and AgentBench
Integrate standard AI agent benchmarks into the direct_benchmark infrastructure
using a plugin-based adapter pattern:
- Add BenchmarkAdapter base class with setup(), load_challenges(), and evaluate()
- Implement GAIAAdapter for the GAIA benchmark (requires HF token)
- Implement SWEBenchAdapter for SWE-bench (requires Docker)
- Implement AgentBenchAdapter for AgentBench multi-environment benchmark
- Extend HarnessConfig with benchmark options (--benchmark, --benchmark-split, etc.)
- Modify ParallelExecutor to use adapter's evaluate() for external benchmarks
- Fix runner to record finish step (was being skipped, breaking answer extraction)
- Add optional benchmarks dependency group with datasets and huggingface-hub
- Increase default benchmark timeout to 900s
Usage:
poetry run direct-benchmark run \
--benchmark agent-bench \
--benchmark-subset dbbench \
--strategies one_shot \
--models claude
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -201,6 +201,35 @@ def cli():
|
||||
is_flag=True,
|
||||
help="Enable debug output.",
|
||||
)
|
||||
@click.option(
|
||||
"--benchmark",
|
||||
"-b",
|
||||
"external_benchmark",
|
||||
default=None,
|
||||
help="Run external benchmark (gaia, swe-bench, agent-bench).",
|
||||
)
|
||||
@click.option(
|
||||
"--benchmark-split",
|
||||
default="validation",
|
||||
help="Benchmark split (train, validation, test). Default: validation.",
|
||||
)
|
||||
@click.option(
|
||||
"--benchmark-subset",
|
||||
default=None,
|
||||
help="Benchmark subset (difficulty level '1', repo name, environment).",
|
||||
)
|
||||
@click.option(
|
||||
"--benchmark-limit",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Maximum number of benchmark challenges to load.",
|
||||
)
|
||||
@click.option(
|
||||
"--benchmark-cache-dir",
|
||||
type=click.Path(path_type=Path),
|
||||
default=None,
|
||||
help="Cache directory for benchmark datasets.",
|
||||
)
|
||||
def run(
|
||||
strategies: str,
|
||||
models: str,
|
||||
@@ -231,6 +260,11 @@ def run(
|
||||
reset_models: tuple[str, ...],
|
||||
reset_challenges: tuple[str, ...],
|
||||
debug: bool,
|
||||
external_benchmark: Optional[str],
|
||||
benchmark_split: str,
|
||||
benchmark_subset: Optional[str],
|
||||
benchmark_limit: Optional[int],
|
||||
benchmark_cache_dir: Optional[Path],
|
||||
):
|
||||
"""Run benchmarks with specified configurations."""
|
||||
# Handle timeout/cutoff options
|
||||
@@ -254,15 +288,18 @@ def run(
|
||||
console.print(f"Available: {list(MODEL_PRESETS.keys())}")
|
||||
sys.exit(1)
|
||||
|
||||
# Find challenges directory
|
||||
if challenges_dir is None:
|
||||
# Find challenges directory (not required for external benchmarks)
|
||||
if challenges_dir is None and not external_benchmark:
|
||||
challenges_dir = find_challenges_dir()
|
||||
if challenges_dir is None:
|
||||
console.print(
|
||||
"[red]Could not find challenges directory. "
|
||||
"Please specify with --challenges-dir[/red]"
|
||||
"Please specify with --challenges-dir or use --benchmark[/red]"
|
||||
)
|
||||
sys.exit(1)
|
||||
elif challenges_dir is None:
|
||||
# External benchmark - use a placeholder path
|
||||
challenges_dir = Path(".")
|
||||
|
||||
# Set up paths
|
||||
if workspace is None:
|
||||
@@ -308,6 +345,11 @@ def run(
|
||||
reset_strategies=list(reset_strategies) if reset_strategies else None,
|
||||
reset_models=list(reset_models) if reset_models else None,
|
||||
reset_challenges=list(reset_challenges) if reset_challenges else None,
|
||||
external_benchmark=external_benchmark,
|
||||
benchmark_split=benchmark_split,
|
||||
benchmark_subset=benchmark_subset,
|
||||
benchmark_limit=benchmark_limit,
|
||||
benchmark_cache_dir=benchmark_cache_dir,
|
||||
)
|
||||
|
||||
# Determine UI mode
|
||||
@@ -331,7 +373,15 @@ def run(
|
||||
console.print(f"Strategies: {strategy_list}")
|
||||
console.print(f"Models: {model_list}")
|
||||
console.print(f"Parallel: {parallel}")
|
||||
console.print(f"Challenges: {challenges_dir}")
|
||||
if external_benchmark:
|
||||
console.print(f"Benchmark: [cyan]{external_benchmark}[/cyan]")
|
||||
console.print(f" Split: {benchmark_split}")
|
||||
if benchmark_subset:
|
||||
console.print(f" Subset: {benchmark_subset}")
|
||||
if benchmark_limit:
|
||||
console.print(f" Limit: {benchmark_limit}")
|
||||
else:
|
||||
console.print(f"Challenges: {challenges_dir}")
|
||||
if categories:
|
||||
console.print(f"Categories: {categories}")
|
||||
if skip_categories:
|
||||
@@ -441,6 +491,56 @@ def list_strategies():
|
||||
console.print(f" - {s}")
|
||||
|
||||
|
||||
@cli.command()
def list_benchmarks():
    """List available external benchmarks."""
    # Imported lazily so the CLI loads even if adapter deps are missing.
    from .adapters import list_adapters

    console.print("\n[bold]Available External Benchmarks[/bold]\n")

    registered = list_adapters()
    if not registered:
        console.print("[dim]No benchmarks registered.[/dim]")
        return

    # Static display metadata for the known adapters; unknown adapters are
    # still listed, just without the extra detail lines.
    benchmark_info = {
        "gaia": {
            "name": "GAIA",
            "description": "General AI Assistant Benchmark - reasoning tasks",
            "splits": "validation, test",
            "subsets": "1 (easy), 2 (medium), 3 (hard)",
            "requires": "HF token (gated dataset)",
        },
        "swe-bench": {
            "name": "SWE-bench",
            "description": "Software Engineering Benchmark - GitHub issues",
            "splits": "dev, test",
            "subsets": "full, lite, verified, or repo name",
            "requires": "Docker, swebench package",
        },
        "agent-bench": {
            "name": "AgentBench",
            "description": "Multi-environment agent benchmark",
            "splits": "dev, test",
            "subsets": "os, db, kg, card_game, ltp, web_shopping, ...",
            "requires": "Varies by environment (Docker for os)",
        },
    }

    for bench_name in sorted(registered):
        details = benchmark_info.get(bench_name, {})
        console.print(f"[cyan]{bench_name}[/cyan]:")
        if details.get("description"):
            console.print(f"  {details['description']}")
        # Remaining optional fields all render as "  Label: value".
        for label, key in (
            ("Splits", "splits"),
            ("Subsets", "subsets"),
            ("Requires", "requires"),
        ):
            if details.get(key):
                console.print(f"  {label}: {details[key]}")
        console.print()
||||
|
||||
|
||||
# Click command group: namespaces the state-management subcommands.
# NOTE(review): subcommands presumably attach via @state.command()
# elsewhere in this file — none are visible in this chunk.
@cli.group()
def state():
    """Manage saved benchmark state (resume/reset)."""
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
"""Benchmark adapter registry and factory."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
# Registry of benchmark adapters, keyed by lowercase benchmark name.
_ADAPTER_REGISTRY: dict[str, type["BenchmarkAdapter"]] = {}  # noqa: F821


def register_adapter(name: str):
    """Decorator to register a benchmark adapter.

    Names are stored case-insensitively (lowercased).

    Usage:
        @register_adapter("gaia")
        class GAIAAdapter(BenchmarkAdapter):
            ...
    """
    key = name.lower()

    def decorator(cls: type["BenchmarkAdapter"]) -> type["BenchmarkAdapter"]:
        _ADAPTER_REGISTRY[key] = cls
        return cls

    return decorator


def get_adapter(name: str) -> Optional[type["BenchmarkAdapter"]]:
    """Get an adapter class by name (case-insensitive).

    Args:
        name: The benchmark name (e.g., "gaia", "swe-bench", "agent-bench").

    Returns:
        The adapter class, or None if not found.
    """
    return _ADAPTER_REGISTRY.get(name.lower())


def list_adapters() -> list[str]:
    """List all registered adapter names."""
    return [*_ADAPTER_REGISTRY]
|
||||
|
||||
|
||||
# Import adapters to trigger registration
|
||||
# These imports are at the bottom to avoid circular imports
|
||||
from .agent_bench import AgentBenchAdapter # noqa: E402, F401
|
||||
from .base import BenchmarkAdapter # noqa: E402, F401
|
||||
from .gaia import GAIAAdapter # noqa: E402, F401
|
||||
from .swe_bench import SWEBenchAdapter # noqa: E402, F401
|
||||
|
||||
__all__ = [
|
||||
"BenchmarkAdapter",
|
||||
"GAIAAdapter",
|
||||
"SWEBenchAdapter",
|
||||
"AgentBenchAdapter",
|
||||
"register_adapter",
|
||||
"get_adapter",
|
||||
"list_adapters",
|
||||
]
|
||||
@@ -0,0 +1,684 @@
|
||||
"""AgentBench adapter.
|
||||
|
||||
AgentBench evaluates LLMs as agents across diverse real-world environments:
|
||||
Operating System, Database, Knowledge Graph, Card Game, Lateral Thinking,
|
||||
Web Shopping, Web Browsing, and ALFWorld.
|
||||
|
||||
GitHub: https://github.com/THUDM/AgentBench
|
||||
Paper: https://arxiv.org/abs/2308.03688
|
||||
|
||||
Requires:
|
||||
- Docker (for OS environment)
|
||||
- Database drivers (for DB environment)
|
||||
- Environment-specific dependencies
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator, Optional
|
||||
|
||||
from ..models import Challenge, ChallengeResult
|
||||
from . import register_adapter
|
||||
from .base import BenchmarkAdapter
|
||||
|
||||
|
||||
@register_adapter("agent-bench")
|
||||
class AgentBenchAdapter(BenchmarkAdapter):
|
||||
"""Adapter for the AgentBench benchmark.
|
||||
|
||||
AgentBench includes 8 distinct environments:
|
||||
- os: Operating system tasks in Docker sandbox
|
||||
- db: Database query tasks (SQLite/PostgreSQL)
|
||||
- kg: Knowledge graph reasoning
|
||||
- card_game: Card game strategy (24-point, etc.)
|
||||
- ltp: Lateral thinking puzzles
|
||||
- web_shopping: WebShop navigation
|
||||
- web_browsing: Real web navigation
|
||||
- alfworld: Embodied agent tasks
|
||||
|
||||
Start with: os, db, kg, card_game, ltp (minimal infrastructure)
|
||||
|
||||
Usage:
|
||||
adapter = AgentBenchAdapter(subset="os")
|
||||
for challenge in adapter.load_challenges():
|
||||
# Run challenge...
|
||||
"""
|
||||
|
||||
name = "agent-bench"
|
||||
description = "AgentBench - Multi-Environment Agent Benchmark"
|
||||
|
||||
GITHUB_REPO = "THUDM/AgentBench"
|
||||
|
||||
# Environment definitions with requirements
|
||||
# Directory names match the actual AgentBench repo structure
|
||||
ENVIRONMENTS: dict[str, dict[str, Any]] = {
|
||||
"os_interaction": {
|
||||
"name": "Operating System",
|
||||
"difficulty": "medium",
|
||||
"requires": ["docker"],
|
||||
"timeout_multiplier": 1.5,
|
||||
},
|
||||
"dbbench": {
|
||||
"name": "Database",
|
||||
"difficulty": "easy",
|
||||
"requires": [],
|
||||
"timeout_multiplier": 1.0,
|
||||
},
|
||||
"knowledgegraph": {
|
||||
"name": "Knowledge Graph",
|
||||
"difficulty": "medium",
|
||||
"requires": [],
|
||||
"timeout_multiplier": 1.0,
|
||||
},
|
||||
"lateralthinkingpuzzle": {
|
||||
"name": "Lateral Thinking Puzzle",
|
||||
"difficulty": "hard",
|
||||
"requires": [],
|
||||
"timeout_multiplier": 1.0,
|
||||
},
|
||||
"mind2web": {
|
||||
"name": "Mind2Web (Web Browsing)",
|
||||
"difficulty": "hard",
|
||||
"requires": ["playwright"],
|
||||
"timeout_multiplier": 3.0,
|
||||
},
|
||||
"alfworld": {
|
||||
"name": "ALFWorld",
|
||||
"difficulty": "hard",
|
||||
"requires": ["alfworld_server"],
|
||||
"timeout_multiplier": 2.0,
|
||||
},
|
||||
"avalon": {
|
||||
"name": "Avalon (Game)",
|
||||
"difficulty": "medium",
|
||||
"requires": [],
|
||||
"timeout_multiplier": 1.5,
|
||||
},
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cache_dir: Optional[Path] = None,
|
||||
split: str = "test",
|
||||
subset: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
):
|
||||
"""Initialize the AgentBench adapter.
|
||||
|
||||
Args:
|
||||
cache_dir: Directory to cache the dataset.
|
||||
split: Dataset split - "dev" or "test".
|
||||
subset: Environment to use (os, db, kg, card_game, ltp, etc.).
|
||||
limit: Maximum number of challenges to load.
|
||||
"""
|
||||
super().__init__(cache_dir, split, subset, limit)
|
||||
self._tasks: dict[str, list[dict[str, Any]]] = {}
|
||||
self._repo_path: Optional[Path] = None
|
||||
|
||||
def setup(self) -> None:
|
||||
"""Clone/update AgentBench repository and load tasks."""
|
||||
self._repo_path = self.cache_dir / "agent_bench" / "repo"
|
||||
|
||||
# Clone or update repository
|
||||
if self._repo_path.exists():
|
||||
self._update_repo()
|
||||
else:
|
||||
self._clone_repo()
|
||||
|
||||
# Load tasks from repository
|
||||
self._load_tasks()
|
||||
|
||||
# Check environment requirements
|
||||
if self.subset:
|
||||
self._check_requirements(self.subset)
|
||||
|
||||
self._is_setup = True
|
||||
|
||||
def _clone_repo(self) -> None:
|
||||
"""Clone the AgentBench repository."""
|
||||
assert self._repo_path is not None # Set in setup()
|
||||
self._repo_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = subprocess.run(
|
||||
[
|
||||
"git",
|
||||
"clone",
|
||||
"--depth",
|
||||
"1",
|
||||
f"https://github.com/{self.GITHUB_REPO}.git",
|
||||
str(self._repo_path),
|
||||
],
|
||||
capture_output=True,
|
||||
timeout=300,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"Failed to clone AgentBench repository: {result.stderr.decode()}"
|
||||
)
|
||||
|
||||
def _update_repo(self) -> None:
|
||||
"""Update the AgentBench repository."""
|
||||
assert self._repo_path is not None # Set in setup()
|
||||
result = subprocess.run(
|
||||
["git", "pull", "--rebase"],
|
||||
cwd=str(self._repo_path),
|
||||
capture_output=True,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
# Pull failed, try fresh clone
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(self._repo_path)
|
||||
self._clone_repo()
|
||||
|
||||
def _load_tasks(self) -> None:
|
||||
"""Load tasks from the repository data files."""
|
||||
if self._repo_path is None:
|
||||
return
|
||||
|
||||
data_dir = self._repo_path / "data"
|
||||
|
||||
if not data_dir.exists():
|
||||
# Try alternative locations
|
||||
for alt_path in ["thudm_data", "tasks", "benchmarks"]:
|
||||
alt_dir = self._repo_path / alt_path
|
||||
if alt_dir.exists():
|
||||
data_dir = alt_dir
|
||||
break
|
||||
|
||||
# Load tasks for each environment
|
||||
for env_name in self.ENVIRONMENTS:
|
||||
env_dir = data_dir / env_name
|
||||
if not env_dir.exists():
|
||||
continue
|
||||
|
||||
self._tasks[env_name] = []
|
||||
|
||||
# Try JSON file first
|
||||
tasks_file = env_dir / f"{self.split}.json"
|
||||
if not tasks_file.exists():
|
||||
tasks_file = env_dir / "tasks.json"
|
||||
|
||||
if tasks_file.exists():
|
||||
with open(tasks_file) as f:
|
||||
self._tasks[env_name] = json.load(f)
|
||||
continue
|
||||
|
||||
# Try JSONL file (AgentBench format)
|
||||
jsonl_file = env_dir / f"{self.split}.jsonl"
|
||||
if not jsonl_file.exists():
|
||||
jsonl_file = env_dir / "standard.jsonl"
|
||||
|
||||
if jsonl_file.exists():
|
||||
with open(jsonl_file) as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line:
|
||||
self._tasks[env_name].append(json.loads(line))
|
||||
continue
|
||||
|
||||
# Try to load from individual task files
|
||||
for task_file in env_dir.glob("*.json"):
|
||||
if task_file.stem not in ("config", "metadata"):
|
||||
with open(task_file) as f:
|
||||
data = json.load(f)
|
||||
if isinstance(data, list):
|
||||
self._tasks[env_name].extend(data)
|
||||
else:
|
||||
self._tasks[env_name].append(data)
|
||||
|
||||
def _check_requirements(self, environment: str) -> None:
|
||||
"""Check if required dependencies are available."""
|
||||
env_config = self.ENVIRONMENTS.get(environment, {})
|
||||
requires = env_config.get("requires", [])
|
||||
|
||||
for req in requires:
|
||||
if req == "docker":
|
||||
self._check_docker()
|
||||
elif req == "playwright":
|
||||
self._check_playwright()
|
||||
# Other requirements can be checked as needed
|
||||
|
||||
def _check_docker(self) -> None:
|
||||
"""Verify Docker is available."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["docker", "info"],
|
||||
capture_output=True,
|
||||
timeout=10,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError("Docker is not running")
|
||||
except FileNotFoundError:
|
||||
raise RuntimeError(
|
||||
"Docker is required for the OS environment. " "Install Docker first."
|
||||
)
|
||||
|
||||
def _check_playwright(self) -> None:
|
||||
"""Verify Playwright is available."""
|
||||
try:
|
||||
import playwright # noqa: F401
|
||||
except ImportError:
|
||||
raise RuntimeError(
|
||||
"Playwright is required for web_browsing environment. "
|
||||
"Install with: pip install playwright && playwright install"
|
||||
)
|
||||
|
||||
def load_challenges(self) -> Iterator[Challenge]:
|
||||
"""Load challenges from the AgentBench dataset.
|
||||
|
||||
Yields:
|
||||
Challenge objects for each AgentBench task.
|
||||
"""
|
||||
self.ensure_setup()
|
||||
|
||||
environments = [self.subset] if self.subset else list(self.ENVIRONMENTS.keys())
|
||||
|
||||
count = 0
|
||||
for env_name in environments:
|
||||
if env_name not in self._tasks:
|
||||
continue
|
||||
|
||||
for idx, task in enumerate(self._tasks[env_name]):
|
||||
# Apply limit
|
||||
if self.limit and count >= self.limit:
|
||||
return
|
||||
|
||||
challenge = self._convert_to_challenge(env_name, idx, task)
|
||||
yield challenge
|
||||
count += 1
|
||||
|
||||
def _convert_to_challenge(
|
||||
self, environment: str, idx: int, task: dict[str, Any]
|
||||
) -> Challenge:
|
||||
"""Convert an AgentBench task to a Challenge."""
|
||||
env_config = self.ENVIRONMENTS[environment]
|
||||
|
||||
# Extract task details (format varies by environment)
|
||||
task_id = task.get("id", task.get("task_id", f"{environment}_{idx}"))
|
||||
description = task.get(
|
||||
"description",
|
||||
task.get("task", task.get("instruction", task.get("question", ""))),
|
||||
)
|
||||
|
||||
# Build task string based on environment
|
||||
task_str = self._format_task(environment, description, task)
|
||||
|
||||
# Get difficulty
|
||||
difficulty = task.get("difficulty", env_config["difficulty"])
|
||||
|
||||
# Calculate timeout
|
||||
base_timeout = 300
|
||||
multiplier = env_config["timeout_multiplier"]
|
||||
cutoff = int(base_timeout * multiplier)
|
||||
|
||||
# Ground truth - extract expected answer based on environment format
|
||||
expected_answer = self._extract_expected_answer(environment, task)
|
||||
ground_truth: dict[str, Any] = {
|
||||
"eval": {"type": f"agent_bench_{environment}"},
|
||||
"environment": environment,
|
||||
"expected": expected_answer,
|
||||
"task_data": task,
|
||||
}
|
||||
|
||||
# Create artifacts directory
|
||||
artifacts_dir = self.cache_dir / "agent_bench" / "artifacts" / task_id
|
||||
artifacts_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
return Challenge(
|
||||
name=f"AgentBench_{task_id}",
|
||||
task=task_str,
|
||||
category=["agent-bench", f"agent-bench_{environment}"],
|
||||
difficulty=difficulty,
|
||||
cutoff=cutoff,
|
||||
ground_truth=ground_truth,
|
||||
artifacts_dir=artifacts_dir,
|
||||
source_path=artifacts_dir / "task.json",
|
||||
)
|
||||
|
||||
def _extract_expected_answer(self, environment: str, task: dict[str, Any]) -> str:
|
||||
"""Extract the expected answer from task based on environment format."""
|
||||
# dbbench format: answer in "label" field (list)
|
||||
if environment == "dbbench":
|
||||
label = task.get("label", [])
|
||||
if isinstance(label, list) and label:
|
||||
return str(label[0])
|
||||
return str(label) if label else ""
|
||||
|
||||
# knowledgegraph format: answer in "answer" array with entity_name
|
||||
if environment == "knowledgegraph":
|
||||
answers = task.get("answer", [])
|
||||
if isinstance(answers, list) and answers:
|
||||
first = answers[0]
|
||||
if isinstance(first, dict):
|
||||
return first.get("entity_name", str(first))
|
||||
return str(first)
|
||||
return ""
|
||||
|
||||
# lateralthinkingpuzzle format
|
||||
if environment == "lateralthinkingpuzzle":
|
||||
return task.get("answer", task.get("solution", ""))
|
||||
|
||||
# Default: try common answer fields
|
||||
for key in ["answer", "expected", "gold", "label", "solution"]:
|
||||
val = task.get(key)
|
||||
if val:
|
||||
if isinstance(val, list):
|
||||
return str(val[0]) if val else ""
|
||||
return str(val)
|
||||
return ""
|
||||
|
||||
def _format_task(
|
||||
self, environment: str, description: str, task: dict[str, Any]
|
||||
) -> str:
|
||||
"""Format the task description based on environment."""
|
||||
if environment == "os":
|
||||
return (
|
||||
f"Operating System Task\n"
|
||||
f"=====================\n\n"
|
||||
f"{description}\n\n"
|
||||
f"You have access to a Linux command line. Execute commands "
|
||||
f"to complete the task. Save your final answer to 'answer.txt'."
|
||||
)
|
||||
|
||||
elif environment in ("db", "dbbench"):
|
||||
# Extract table information from the task
|
||||
table_info = task.get("table", {})
|
||||
table_name = table_info.get("table_name", "data_table")
|
||||
columns_info = table_info.get("table_info", {}).get("columns", [])
|
||||
rows = table_info.get("table_info", {}).get("rows", [])
|
||||
|
||||
# Format columns
|
||||
col_names = [col.get("name", "") for col in columns_info]
|
||||
|
||||
# Build table display
|
||||
table_str_parts = [
|
||||
f"Table: {table_name}",
|
||||
f"Columns: {', '.join(col_names)}",
|
||||
]
|
||||
table_str_parts.append("\nData (first 20 rows):")
|
||||
for i, row in enumerate(rows[:20]):
|
||||
row_str = " | ".join(str(cell) for cell in row)
|
||||
table_str_parts.append(f" {i+1}. {row_str}")
|
||||
if len(rows) > 20:
|
||||
table_str_parts.append(f" ... ({len(rows) - 20} more rows)")
|
||||
|
||||
table_str = "\n".join(table_str_parts)
|
||||
|
||||
return (
|
||||
f"Database Query Task\n"
|
||||
f"==================\n\n"
|
||||
f"Question: {description}\n\n"
|
||||
f"{table_str}\n\n"
|
||||
f"Analyze the table data above and answer the question. "
|
||||
f"Use the 'finish' command with your answer, or save your answer "
|
||||
f"to 'answer.txt'. Provide only the answer value, not an explanation."
|
||||
)
|
||||
|
||||
elif environment == "kg":
|
||||
kg_info = task.get("kg_info", "")
|
||||
return (
|
||||
f"Knowledge Graph Task\n"
|
||||
f"====================\n\n"
|
||||
f"{description}\n\n"
|
||||
f"Knowledge Graph Information:\n{kg_info}\n\n"
|
||||
f"Reason over the knowledge graph to answer. "
|
||||
f"Save your answer to 'answer.txt'."
|
||||
)
|
||||
|
||||
elif environment == "card_game":
|
||||
numbers = task.get("numbers", [])
|
||||
return (
|
||||
f"Card Game Task (24-point)\n"
|
||||
f"========================\n\n"
|
||||
f"Numbers: {numbers}\n\n"
|
||||
f"Use +, -, *, / and parentheses to make exactly 24. "
|
||||
f"Each number must be used exactly once.\n\n"
|
||||
f"Save your expression to 'answer.txt'."
|
||||
)
|
||||
|
||||
elif environment == "ltp":
|
||||
return (
|
||||
f"Lateral Thinking Puzzle\n"
|
||||
f"======================\n\n"
|
||||
f"{description}\n\n"
|
||||
f"Ask yes/no questions to figure out the answer. "
|
||||
f"Save your final solution to 'answer.txt'."
|
||||
)
|
||||
|
||||
elif environment in ("web_shopping", "web_browsing"):
|
||||
return (
|
||||
f"Web Task ({environment.replace('_', ' ').title()})\n"
|
||||
f"{'=' * 40}\n\n"
|
||||
f"{description}\n\n"
|
||||
f"Navigate the web to complete the task. "
|
||||
f"Save your final answer to 'answer.txt'."
|
||||
)
|
||||
|
||||
elif environment == "alfworld":
|
||||
return (
|
||||
f"ALFWorld Task\n"
|
||||
f"=============\n\n"
|
||||
f"{description}\n\n"
|
||||
f"Navigate and interact with the environment to complete the task. "
|
||||
f"Use available actions to achieve the goal."
|
||||
)
|
||||
|
||||
else:
|
||||
return description
|
||||
|
||||
def evaluate(
|
||||
self,
|
||||
result: ChallengeResult,
|
||||
challenge: Challenge,
|
||||
workspace_dir: Path,
|
||||
) -> ChallengeResult:
|
||||
"""Evaluate using environment-specific logic."""
|
||||
ground = challenge.ground_truth
|
||||
environment = ground["environment"]
|
||||
|
||||
# Extract answer from agent output
|
||||
agent_answer = self._extract_answer(result, environment)
|
||||
|
||||
if not agent_answer:
|
||||
result.success = False
|
||||
result.score = 0.0
|
||||
result.error_message = "No answer found in agent output"
|
||||
return result
|
||||
|
||||
# Environment-specific evaluation
|
||||
expected = ground.get("expected", "")
|
||||
|
||||
if environment == "card_game":
|
||||
eval_result = self._evaluate_card_game(agent_answer, ground["task_data"])
|
||||
elif environment in ("db", "dbbench"):
|
||||
eval_result = self._evaluate_db(agent_answer, expected, ground["task_data"])
|
||||
elif environment in (
|
||||
"os",
|
||||
"os_interaction",
|
||||
"kg",
|
||||
"knowledgegraph",
|
||||
"ltp",
|
||||
"lateralthinkingpuzzle",
|
||||
):
|
||||
eval_result = self._evaluate_string_match(agent_answer, expected)
|
||||
else:
|
||||
# Default string matching
|
||||
eval_result = self._evaluate_string_match(agent_answer, expected)
|
||||
|
||||
result.success = eval_result["success"]
|
||||
result.score = eval_result["score"]
|
||||
if eval_result.get("error"):
|
||||
result.error_message = eval_result["error"]
|
||||
|
||||
return result
|
||||
|
||||
def _extract_answer(self, result: ChallengeResult, environment: str) -> str:
|
||||
"""Extract answer from agent output."""
|
||||
# Look for answer.txt
|
||||
for filename, content in result.output_files.items():
|
||||
if "answer" in filename.lower():
|
||||
return content.strip()
|
||||
|
||||
# Environment-specific extraction
|
||||
if environment in ("db", "dbbench"):
|
||||
for filename, content in result.output_files.items():
|
||||
if filename.endswith(".sql"):
|
||||
return content.strip()
|
||||
|
||||
# Check if agent used finish command with an answer
|
||||
if result.steps:
|
||||
last_step = result.steps[-1]
|
||||
if last_step.tool_name == "finish":
|
||||
reason = last_step.tool_args.get("reason", "").strip()
|
||||
# Try to extract the actual answer from the finish reason
|
||||
# Often the answer is embedded in the reason
|
||||
if reason:
|
||||
return reason
|
||||
|
||||
# Look for potential answer in any text file output
|
||||
for filename, content in result.output_files.items():
|
||||
if filename.endswith(".txt") and content.strip():
|
||||
return content.strip()
|
||||
|
||||
return ""
|
||||
|
||||
def _evaluate_card_game(
|
||||
self, answer: str, task_data: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
"""Evaluate 24-point card game answer."""
|
||||
# Store numbers for potential future use in full validation
|
||||
_numbers = task_data.get("numbers", []) # noqa: F841
|
||||
|
||||
try:
|
||||
# Check that the expression evaluates to 24
|
||||
# and uses all numbers exactly once
|
||||
expr = answer.strip()
|
||||
|
||||
# Safety check - only allow math operations
|
||||
allowed_chars = set("0123456789+-*/() .")
|
||||
if not all(c in allowed_chars for c in expr):
|
||||
return {
|
||||
"success": False,
|
||||
"score": 0.0,
|
||||
"error": "Invalid characters in expression",
|
||||
}
|
||||
|
||||
# Evaluate the expression
|
||||
result = eval(expr)
|
||||
|
||||
if abs(result - 24) < 0.0001:
|
||||
# Check that all numbers are used exactly once
|
||||
# (simplified check - full implementation would parse the expression)
|
||||
return {"success": True, "score": 1.0, "error": None}
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"score": 0.0,
|
||||
"error": f"Expression evaluates to {result}, not 24",
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"score": 0.0,
|
||||
"error": f"Failed to evaluate expression: {str(e)}",
|
||||
}
|
||||
|
||||
def _evaluate_db(
|
||||
self, query: str, expected: str, task_data: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
"""Evaluate SQL query answer."""
|
||||
# For now, use string matching on the result
|
||||
# Full implementation would execute query and compare results
|
||||
return self._evaluate_string_match(query, expected)
|
||||
|
||||
def _evaluate_string_match(self, actual: str, expected: str) -> dict[str, Any]:
|
||||
"""Strict normalized string matching."""
|
||||
actual_norm = actual.lower().strip()
|
||||
expected_norm = expected.lower().strip()
|
||||
|
||||
# If no expected answer, fail (can't evaluate)
|
||||
if not expected_norm:
|
||||
return {
|
||||
"success": False,
|
||||
"score": 0.0,
|
||||
"error": "No expected answer to compare against",
|
||||
}
|
||||
|
||||
# If no actual answer, fail
|
||||
if not actual_norm:
|
||||
return {
|
||||
"success": False,
|
||||
"score": 0.0,
|
||||
"error": f"No answer provided, expected '{expected}'",
|
||||
}
|
||||
|
||||
# Exact match (after normalization)
|
||||
if actual_norm == expected_norm:
|
||||
return {"success": True, "score": 1.0, "error": None}
|
||||
|
||||
# Check if expected is contained in actual (for verbose answers)
|
||||
if expected_norm in actual_norm:
|
||||
return {"success": True, "score": 0.9, "error": None}
|
||||
|
||||
return {
|
||||
"success": False,
|
||||
"score": 0.0,
|
||||
"error": f"Expected '{expected}', got '{actual}'",
|
||||
}
|
||||
|
||||
def provision_environment(self, challenge: Challenge) -> dict[str, Any]:
|
||||
"""Set up environment-specific resources."""
|
||||
ground = challenge.ground_truth
|
||||
environment = ground["environment"]
|
||||
|
||||
env_config: dict[str, Any] = {
|
||||
"environment": environment,
|
||||
}
|
||||
|
||||
if environment == "os":
|
||||
# Would spin up Docker container here
|
||||
env_config["docker_image"] = "ubuntu:22.04"
|
||||
|
||||
elif environment == "db":
|
||||
# Set up SQLite database
|
||||
task_data = ground["task_data"]
|
||||
db_setup = task_data.get("db_setup", "")
|
||||
env_config["db_type"] = "sqlite"
|
||||
env_config["db_setup"] = db_setup
|
||||
|
||||
return env_config
|
||||
|
||||
def get_challenge_count(self) -> Optional[int]:
|
||||
"""Get the number of challenges."""
|
||||
self.ensure_setup()
|
||||
|
||||
if self.subset:
|
||||
tasks = self._tasks.get(self.subset, [])
|
||||
count = len(tasks)
|
||||
else:
|
||||
count = sum(len(tasks) for tasks in self._tasks.values())
|
||||
|
||||
if self.limit:
|
||||
count = min(count, self.limit)
|
||||
|
||||
return count
|
||||
|
||||
def get_metadata(self) -> dict[str, Any]:
|
||||
"""Get AgentBench metadata."""
|
||||
metadata = super().get_metadata()
|
||||
metadata.update(
|
||||
{
|
||||
"repository": f"https://github.com/{self.GITHUB_REPO}",
|
||||
"environments": list(self.ENVIRONMENTS.keys()),
|
||||
"easy_environments": ["db", "card_game", "ltp"],
|
||||
"medium_environments": ["os", "kg", "web_shopping"],
|
||||
"hard_environments": ["web_browsing", "alfworld"],
|
||||
}
|
||||
)
|
||||
return metadata
|
||||
135
classic/direct_benchmark/direct_benchmark/adapters/base.py
Normal file
135
classic/direct_benchmark/direct_benchmark/adapters/base.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""Base class for benchmark adapters."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator, Optional
|
||||
|
||||
from ..models import Challenge, ChallengeResult
|
||||
|
||||
|
||||
class BenchmarkAdapter(ABC):
    """Abstract base class for external benchmark adapters.

    An adapter translates an external benchmark's native format into the
    Challenge model consumed by the direct_benchmark harness.

    Subclasses must implement:
        - setup(): one-time initialization (download datasets, etc.)
        - load_challenges(): yield Challenge objects from the benchmark
        - evaluate(): benchmark-specific evaluation logic

    Subclasses may also override:
        - provision_environment(): prepare a runtime environment per challenge
        - cleanup(): release resources once the benchmark run is finished
    """

    # Identifiers overridden by concrete adapters.
    name: str = "base"
    description: str = "Base benchmark adapter"

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        split: str = "validation",
        subset: Optional[str] = None,
        limit: Optional[int] = None,
    ):
        """Initialize the adapter.

        Args:
            cache_dir: Directory to cache downloaded datasets; defaults to
                ~/.cache/autogpt_benchmarks when not given.
            split: Dataset split to use (train/validation/test).
            subset: Optional subset filter (e.g. difficulty level, repo name).
            limit: Maximum number of challenges to load.
        """
        default_cache = Path.home() / ".cache" / "autogpt_benchmarks"
        self.cache_dir = cache_dir if cache_dir is not None else default_cache
        self.split = split
        self.subset = subset
        self.limit = limit
        # Flipped by ensure_setup() so setup() runs at most once per instance.
        self._is_setup = False

    @abstractmethod
    def setup(self) -> None:
        """Perform one-time setup (download datasets, authenticate, etc.).

        Called automatically before load_challenges() when the adapter has
        not been set up yet; implementations should be idempotent.
        """

    @abstractmethod
    def load_challenges(self) -> Iterator[Challenge]:
        """Load and yield Challenge objects translated from the benchmark."""

    @abstractmethod
    def evaluate(
        self,
        result: ChallengeResult,
        challenge: Challenge,
        workspace_dir: Path,
    ) -> ChallengeResult:
        """Score a challenge run with benchmark-specific logic.

        Args:
            result: The result from running the challenge.
            challenge: The challenge that was run.
            workspace_dir: Directory containing the agent's output.

        Returns:
            Updated ChallengeResult with success/score populated.
        """

    def provision_environment(self, challenge: Challenge) -> dict[str, Any]:
        """Prepare the runtime environment for a single challenge.

        The default implementation provisions nothing. Override for
        benchmarks that need Docker containers, database setup, etc.

        Args:
            challenge: The challenge to provision for.

        Returns:
            Environment configuration dict (passed to the runner).
        """
        return {}

    def cleanup(self) -> None:
        """Release resources after the benchmark run (no-op by default).

        Override to stop containers, close connections, etc.
        """

    def ensure_setup(self) -> None:
        """Lazily run setup() exactly once."""
        if self._is_setup:
            return
        self.setup()
        self._is_setup = True

    def get_challenge_count(self) -> Optional[int]:
        """Return the total challenge count without loading.

        Returns:
            Number of challenges, or None if unknown without loading.
        """
        return None

    def get_metadata(self) -> dict[str, Any]:
        """Return basic metadata describing this benchmark configuration."""
        return {
            "name": self.name,
            "description": self.description,
            "split": self.split,
            "subset": self.subset,
            "limit": self.limit,
        }
|
||||
318
classic/direct_benchmark/direct_benchmark/adapters/gaia.py
Normal file
318
classic/direct_benchmark/direct_benchmark/adapters/gaia.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""GAIA benchmark adapter.
|
||||
|
||||
GAIA (General AI Assistant Benchmark) evaluates AI assistants on real-world tasks
|
||||
requiring reasoning, tool use, and web browsing.
|
||||
|
||||
Dataset: https://huggingface.co/datasets/gaia-benchmark/GAIA
|
||||
Leaderboard: https://huggingface.co/spaces/gaia-benchmark/leaderboard
|
||||
|
||||
Requires:
|
||||
- Hugging Face account with access to the gated dataset
|
||||
- HUGGING_FACE_HUB_TOKEN environment variable set
|
||||
- datasets and huggingface-hub packages
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import string
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator, Optional
|
||||
|
||||
from ..models import Challenge, ChallengeResult
|
||||
from . import register_adapter
|
||||
from .base import BenchmarkAdapter
|
||||
|
||||
|
||||
def _normalize_answer(answer: str) -> str:
|
||||
"""Normalize answer for comparison (GAIA-style normalization).
|
||||
|
||||
- Lowercase
|
||||
- Remove articles (a, an, the)
|
||||
- Remove punctuation
|
||||
- Collapse whitespace
|
||||
"""
|
||||
# Lowercase
|
||||
answer = answer.lower()
|
||||
|
||||
# Remove articles
|
||||
answer = re.sub(r"\b(a|an|the)\b", " ", answer)
|
||||
|
||||
# Remove punctuation
|
||||
answer = answer.translate(str.maketrans("", "", string.punctuation))
|
||||
|
||||
# Collapse whitespace
|
||||
answer = " ".join(answer.split())
|
||||
|
||||
return answer.strip()
|
||||
|
||||
|
||||
@register_adapter("gaia")
class GAIAAdapter(BenchmarkAdapter):
    """Adapter for the GAIA benchmark.

    GAIA provides real-world tasks at three difficulty levels:
    - Level 1: Simple tasks (single tool, straightforward reasoning)
    - Level 2: Moderate tasks (multiple tools, multi-step reasoning)
    - Level 3: Complex tasks (complex reasoning, tool chaining)

    Usage:
        adapter = GAIAAdapter(split="validation", subset="1")
        for challenge in adapter.load_challenges():
            # Run challenge...
    """

    name = "gaia"
    description = "GAIA - General AI Assistant Benchmark"

    HF_DATASET = "gaia-benchmark/GAIA"

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        split: str = "validation",
        subset: Optional[str] = None,
        limit: Optional[int] = None,
    ):
        """Initialize the GAIA adapter.

        Args:
            cache_dir: Directory to cache the dataset.
            split: Dataset split - "validation" (has answers) or "test" (leaderboard).
            subset: Difficulty level filter - "1", "2", or "3".
            limit: Maximum number of challenges to load.
        """
        super().__init__(cache_dir, split, subset, limit)
        self._dataset = None
        # task_id -> expected local path of the task's file attachment.
        self._file_cache: dict[str, Path] = {}

    @staticmethod
    def _coerce_level(raw: Any) -> int:
        """Coerce the dataset's "Level" field to an int.

        The subset filter in load_challenges() compares Level as a string,
        which implies the raw field may be str or int; the difficulty and
        cutoff maps below use int keys, so normalize here. Returns 0 for
        missing or unparseable values (falls through to the map defaults).
        """
        try:
            return int(raw)
        except (TypeError, ValueError):
            return 0

    def setup(self) -> None:
        """Download and cache the GAIA dataset from Hugging Face.

        Raises:
            ImportError: If the 'datasets' package is not installed.
            ValueError: If no Hugging Face token is configured (GAIA is gated).
        """
        try:
            from datasets import load_dataset
        except ImportError:
            raise ImportError(
                "GAIA adapter requires the 'datasets' package. "
                "Install with: pip install datasets huggingface-hub"
            )

        # GAIA is a gated dataset; a HF token is mandatory.
        token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or os.environ.get("HF_TOKEN")
        if not token:
            raise ValueError(
                "GAIA dataset requires authentication. "
                "Set HUGGING_FACE_HUB_TOKEN or HF_TOKEN environment variable. "
                "Get your token at https://huggingface.co/settings/tokens"
            )

        self._dataset = load_dataset(
            self.HF_DATASET,
            split=self.split,
            token=token,
            cache_dir=str(self.cache_dir / "gaia"),
        )

        # Index any per-task file attachments.
        self._setup_file_cache()

        self._is_setup = True

    def _setup_file_cache(self) -> None:
        """Index file attachments referenced by the dataset.

        NOTE(review): this only records the expected local path per task_id;
        nothing here writes the attachment bytes to disk. Downstream code
        guards with src.exists(), so missing files are skipped silently —
        confirm the datasets cache actually materializes attachments.
        """
        if self._dataset is None:
            return

        file_dir = self.cache_dir / "gaia" / "files"
        file_dir.mkdir(parents=True, exist_ok=True)

        for item in self._dataset:
            if item.get("file_name") and item.get("file_path"):
                file_name = item["file_name"]
                self._file_cache[item["task_id"]] = file_dir / file_name

    def load_challenges(self) -> Iterator[Challenge]:
        """Load challenges from the GAIA dataset.

        Honors the subset (difficulty level) filter and the load limit.

        Yields:
            Challenge objects for each GAIA task.
        """
        self.ensure_setup()

        if self._dataset is None:
            return

        count = 0
        for item in self._dataset:
            # Subset filter compares the level as a string ("1"/"2"/"3").
            if self.subset and str(item.get("Level")) != self.subset:
                continue

            # Apply limit
            if self.limit and count >= self.limit:
                break

            challenge = self._convert_to_challenge(item)
            yield challenge
            count += 1

    def _convert_to_challenge(self, item: dict[str, Any]) -> Challenge:
        """Convert a GAIA dataset item to a Challenge."""
        task_id = item["task_id"]
        question = item["Question"]
        # Robust to both int and str "Level" encodings in the dataset.
        level = self._coerce_level(item.get("Level", 1))
        final_answer = item.get("Final answer", "")
        file_name = item.get("file_name", "")

        # Build task description
        task = question
        if file_name:
            task = f"{question}\n\nA file has been provided: {file_name}"

        # Map GAIA levels to difficulty
        difficulty_map = {1: "easy", 2: "medium", 3: "hard"}
        difficulty = difficulty_map.get(level, "unknown")

        # Cutoff (seconds) scales with level
        cutoff_map = {1: 180, 2: 300, 3: 600}
        cutoff = cutoff_map.get(level, 300)

        # Ground truth for evaluation
        ground_truth: dict[str, Any] = {
            "answer": final_answer,
            "eval": {"type": "gaia_match"},
        }

        # Create artifacts directory for any files
        artifacts_dir = self.cache_dir / "gaia" / "artifacts" / task_id
        artifacts_dir.mkdir(parents=True, exist_ok=True)

        # Copy the attachment next to the challenge if it was cached.
        if task_id in self._file_cache:
            src = self._file_cache[task_id]
            if src.exists():
                import shutil

                shutil.copy2(src, artifacts_dir / src.name)

        return Challenge(
            name=f"GAIA_{task_id}",
            task=task,
            category=["gaia", f"gaia_level_{level}"],
            difficulty=difficulty,
            cutoff=cutoff,
            ground_truth=ground_truth,
            artifacts_dir=artifacts_dir,
            source_path=artifacts_dir / "data.json",
        )

    def evaluate(
        self,
        result: ChallengeResult,
        challenge: Challenge,
        workspace_dir: Path,
    ) -> ChallengeResult:
        """Evaluate using GAIA-style normalized string matching.

        GAIA uses exact string matching after normalization:
        lowercase, remove articles (a, an, the), remove punctuation,
        collapse whitespace.

        Args:
            result: The result from running the challenge.
            challenge: The challenge that was run.
            workspace_dir: Directory containing the agent's output (unused;
                answers are read from result.output_files/steps).

        Returns:
            The result with success/score (and error_message on failure) set.
        """
        ground = challenge.ground_truth
        expected = ground.get("answer", "")

        if not expected:
            # Test split has no answers - can't evaluate locally
            result.success = False
            result.score = 0.0
            result.error_message = (
                "No ground truth (test split - submit to leaderboard)"
            )
            return result

        # Get the agent's answer from output
        agent_answer = self._extract_answer(result)

        if not agent_answer:
            result.success = False
            result.score = 0.0
            result.error_message = "No answer found in agent output"
            return result

        # Normalize both answers
        normalized_expected = _normalize_answer(expected)
        normalized_actual = _normalize_answer(agent_answer)

        # Exact match after normalization
        if normalized_expected == normalized_actual:
            result.success = True
            result.score = 1.0
        else:
            result.success = False
            result.score = 0.0
            result.error_message = (
                f"Answer mismatch: expected '{expected}', got '{agent_answer}'"
            )

        return result

    def _extract_answer(self, result: ChallengeResult) -> str:
        """Extract the final answer from the agent's output.

        Search order:
        1. Any output file with "answer" in its name.
        2. The final step, if it is a "finish" tool call (its reason field).
        """
        # Check for answer.txt (or any *answer* file)
        for filename, content in result.output_files.items():
            if "answer" in filename.lower():
                return content.strip()

        # Check final step result
        if result.steps:
            last_step = result.steps[-1]
            if last_step.tool_name == "finish":
                # Try to extract answer from finish arguments
                reason = last_step.tool_args.get("reason", "")
                return reason.strip()

        return ""

    def get_challenge_count(self) -> Optional[int]:
        """Get the number of challenges after subset/limit filtering."""
        self.ensure_setup()
        if self._dataset is None:
            return None

        count = len(self._dataset)

        # Apply subset filter (difficulty level, compared as a string)
        if self.subset:
            count = sum(
                1 for item in self._dataset if str(item.get("Level")) == self.subset
            )

        # Apply limit
        if self.limit:
            count = min(count, self.limit)

        return count

    def get_metadata(self) -> dict[str, Any]:
        """Get GAIA benchmark metadata."""
        metadata = super().get_metadata()
        metadata.update(
            {
                "dataset": self.HF_DATASET,
                "levels": ["1", "2", "3"],
                "splits": ["validation", "test"],
                "requires_auth": True,
                "leaderboard": (
                    "https://huggingface.co/spaces/gaia-benchmark/leaderboard"
                ),
            }
        )
        return metadata
|
||||
458
classic/direct_benchmark/direct_benchmark/adapters/swe_bench.py
Normal file
458
classic/direct_benchmark/direct_benchmark/adapters/swe_bench.py
Normal file
@@ -0,0 +1,458 @@
|
||||
"""SWE-bench adapter.
|
||||
|
||||
SWE-bench evaluates AI models on real-world GitHub issues from popular Python
|
||||
repositories, requiring models to generate patches that fix the issues.
|
||||
|
||||
GitHub: https://github.com/SWE-bench/SWE-bench
|
||||
Dataset: https://huggingface.co/datasets/princeton-nlp/SWE-bench
|
||||
|
||||
Requires:
|
||||
- Docker Engine (for containerized evaluation)
|
||||
- swebench package (pip install swebench)
|
||||
- ~120GB disk space for full dataset
|
||||
- OR Modal for cloud-based evaluation
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator, Optional
|
||||
|
||||
from ..models import Challenge, ChallengeResult
|
||||
from . import register_adapter
|
||||
from .base import BenchmarkAdapter
|
||||
|
||||
|
||||
@register_adapter("swe-bench")
class SWEBenchAdapter(BenchmarkAdapter):
    """Adapter for the SWE-bench benchmark.

    SWE-bench provides 2,294 real GitHub issues from 12 Python repositories.
    Models must generate patches that fix the issues, evaluated by running
    the repository's test suite.

    Subsets:
    - "full": All 2,294 instances
    - "lite": 300 curated instances
    - "verified": 500 human-validated solvable instances

    Usage:
        adapter = SWEBenchAdapter(subset="lite")
        for challenge in adapter.load_challenges():
            # Run challenge...
    """

    name = "swe-bench"
    description = "SWE-bench - Software Engineering Benchmark"

    # Hugging Face dataset identifiers for each subset variant.
    HF_DATASET = "princeton-nlp/SWE-bench"
    HF_LITE = "princeton-nlp/SWE-bench_Lite"
    HF_VERIFIED = "princeton-nlp/SWE-bench_Verified"

    # Repository-specific timeout multipliers (applied to the 600s base cutoff
    # in _convert_to_challenge; repos not listed get 1.0).
    REPO_TIMEOUTS: dict[str, float] = {
        "django/django": 1.5,
        "matplotlib/matplotlib": 2.0,
        "scikit-learn/scikit-learn": 1.5,
        "sympy/sympy": 1.2,
    }

    def __init__(
        self,
        cache_dir: Optional[Path] = None,
        split: str = "test",
        subset: Optional[str] = None,
        limit: Optional[int] = None,
        use_modal: bool = False,
    ):
        """Initialize the SWE-bench adapter.

        Args:
            cache_dir: Directory to cache the dataset.
            split: Dataset split - "dev" or "test".
            subset: Subset to use - "full", "lite", "verified", or a repo name.
            limit: Maximum number of challenges to load.
            use_modal: Use Modal for cloud-based evaluation instead of local Docker.
        """
        super().__init__(cache_dir, split, subset, limit)
        self._dataset = None
        self._use_modal = use_modal

    def setup(self) -> None:
        """Download and cache the SWE-bench dataset.

        Raises:
            ImportError: If the 'datasets' package is not installed.
            RuntimeError: If local Docker is required but unavailable.
        """
        try:
            from datasets import load_dataset
        except ImportError:
            raise ImportError(
                "SWE-bench adapter requires the 'datasets' package. "
                "Install with: pip install datasets"
            )

        # Select dataset based on subset ("lite"/"verified" map to dedicated
        # HF datasets; anything else, including repo names, uses the full set).
        if self.subset == "lite":
            dataset_name = self.HF_LITE
        elif self.subset == "verified":
            dataset_name = self.HF_VERIFIED
        else:
            dataset_name = self.HF_DATASET

        # Load dataset
        self._dataset = load_dataset(
            dataset_name,
            split=self.split,
            cache_dir=str(self.cache_dir / "swe_bench"),
        )

        # Check for Docker if not using Modal (fail fast before running agents)
        if not self._use_modal:
            self._check_docker()

        self._is_setup = True

    def _check_docker(self) -> None:
        """Verify Docker is available for evaluation.

        Raises:
            RuntimeError: If Docker is missing, not running, or unresponsive.
        """
        try:
            result = subprocess.run(
                ["docker", "info"],
                capture_output=True,
                timeout=10,
            )
            if result.returncode != 0:
                raise RuntimeError("Docker is not running")
        except FileNotFoundError:
            raise RuntimeError(
                "Docker is required for SWE-bench evaluation. "
                "Install Docker or use use_modal=True for cloud evaluation."
            )
        except subprocess.TimeoutExpired:
            raise RuntimeError("Docker is not responding")

    def load_challenges(self) -> Iterator[Challenge]:
        """Load challenges from the SWE-bench dataset.

        Yields:
            Challenge objects for each SWE-bench instance, honoring the
            repo-name subset filter and the load limit.
        """
        self.ensure_setup()

        if self._dataset is None:
            return

        count = 0
        for item in self._dataset:
            # Apply repo filter (if subset is a repo name rather than a variant)
            if self.subset and self.subset not in ("full", "lite", "verified"):
                if item.get("repo") != self.subset:
                    continue

            # Apply limit
            if self.limit and count >= self.limit:
                break

            challenge = self._convert_to_challenge(item)
            yield challenge
            count += 1

    def _convert_to_challenge(self, item: dict[str, Any]) -> Challenge:
        """Convert a SWE-bench dataset item to a Challenge."""
        instance_id = item["instance_id"]
        repo = item.get("repo", "unknown")
        problem_statement = item.get("problem_statement", "")
        base_commit = item.get("base_commit", "")
        hints_text = item.get("hints_text", "")

        # Build comprehensive task description
        task_parts = [
            f"Repository: {repo}",
            f"Base commit: {base_commit}",
            "",
            "Problem Statement:",
            problem_statement,
        ]

        if hints_text:
            task_parts.extend(["", "Hints:", hints_text])

        task_parts.extend(
            [
                "",
                "Your task: Generate a patch file (in unified diff format) that "
                "fixes the issue described above. The patch should be saved to "
                "'patch.diff' in your workspace.",
            ]
        )

        task = "\n".join(task_parts)

        # Determine difficulty based on repo complexity (hand-assigned).
        difficulty_map = {
            "astropy/astropy": "hard",
            "django/django": "medium",
            "flask/flask": "easy",
            "matplotlib/matplotlib": "hard",
            "pallets/flask": "easy",
            "psf/requests": "easy",
            "pydata/xarray": "medium",
            "pylint-dev/pylint": "medium",
            "pytest-dev/pytest": "medium",
            "scikit-learn/scikit-learn": "hard",
            "sphinx-doc/sphinx": "medium",
            "sympy/sympy": "hard",
        }
        difficulty = difficulty_map.get(repo, "medium")

        # Calculate timeout with repo-specific multiplier
        base_timeout = 600
        multiplier = self.REPO_TIMEOUTS.get(repo, 1.0)
        cutoff = int(base_timeout * multiplier)

        # Ground truth includes the gold patch for reference
        gold_patch = item.get("patch", "")
        test_patch = item.get("test_patch", "")

        ground_truth: dict[str, Any] = {
            "eval": {"type": "swe_bench"},
            "instance_id": instance_id,
            "repo": repo,
            "base_commit": base_commit,
            "gold_patch": gold_patch,
            "test_patch": test_patch,
            "pass_to_pass": item.get("PASS_TO_PASS", ""),
            "fail_to_pass": item.get("FAIL_TO_PASS", ""),
        }

        # Create artifacts directory
        artifacts_dir = self.cache_dir / "swe_bench" / "artifacts" / instance_id
        artifacts_dir.mkdir(parents=True, exist_ok=True)

        # Save problem context for reference
        context_file = artifacts_dir / "context.json"
        with open(context_file, "w") as f:
            json.dump(
                {
                    "instance_id": instance_id,
                    "repo": repo,
                    "base_commit": base_commit,
                    "problem_statement": problem_statement,
                },
                f,
                indent=2,
            )

        return Challenge(
            name=f"SWE_{instance_id}",
            task=task,
            category=["swe-bench", f"swe-bench_{repo.replace('/', '_')}"],
            difficulty=difficulty,
            cutoff=cutoff,
            ground_truth=ground_truth,
            artifacts_dir=artifacts_dir,
            source_path=context_file,
        )

    def evaluate(
        self,
        result: ChallengeResult,
        challenge: Challenge,
        workspace_dir: Path,
    ) -> ChallengeResult:
        """Evaluate using SWE-bench's Docker-based test harness.

        The agent's patch is applied to the repository in a Docker container,
        and the test suite is run to verify the fix.

        Args:
            result: The result from running the challenge.
            challenge: The challenge that was run.
            workspace_dir: Directory containing the agent's output (unused;
                patches are read from result.output_files).

        Returns:
            The result with success/score (and error_message on failure) set.
        """
        ground = challenge.ground_truth

        # Get the generated patch
        patch_content = self._extract_patch(result)

        if not patch_content:
            result.success = False
            result.score = 0.0
            result.error_message = "No patch.diff found in agent output"
            return result

        # Run evaluation via the configured backend
        if self._use_modal:
            eval_result = self._evaluate_with_modal(ground, patch_content)
        else:
            eval_result = self._evaluate_with_docker(ground, patch_content)

        result.success = eval_result["success"]
        result.score = eval_result["score"]
        if eval_result.get("error"):
            result.error_message = eval_result["error"]

        return result

    def _extract_patch(self, result: ChallengeResult) -> str:
        """Extract the patch from the agent's output.

        Prefers an explicit patch.diff/*.patch file; otherwise falls back to
        the first output file whose content looks like a unified diff.
        """
        # Look for patch.diff file
        for filename, content in result.output_files.items():
            if filename.endswith("patch.diff") or filename.endswith(".patch"):
                return content

        # Look for diff content in any output file
        for filename, content in result.output_files.items():
            if content.strip().startswith("diff --git") or content.strip().startswith(
                "---"
            ):
                return content

        return ""

    def _evaluate_with_docker(
        self, ground: dict[str, Any], patch: str
    ) -> dict[str, Any]:
        """Run evaluation using local Docker.

        Returns a dict with "success" (bool), "score" (float) and optional
        "error" (str) keys; never raises.

        NOTE(review): imports `swebench.harness.run_evaluation.run_evaluation`
        with keyword args matching an older swebench API — verify against the
        installed swebench version.
        """
        try:
            # Try to import swebench harness
            from swebench.harness.run_evaluation import run_evaluation
        except ImportError:
            return {
                "success": False,
                "score": 0.0,
                "error": (
                    "swebench package not installed. "
                    "Install with: pip install swebench"
                ),
            }

        instance_id = ground["instance_id"]
        # These are available for future use in more sophisticated evaluation
        _repo = ground["repo"]  # noqa: F841
        _base_commit = ground["base_commit"]  # noqa: F841

        # Write patch to temp file (delete=False so the path outlives the block;
        # cleaned up in the finally clause below)
        with tempfile.NamedTemporaryFile(mode="w", suffix=".patch", delete=False) as f:
            f.write(patch)
            patch_file = f.name

        # Initialize predictions_file path before try block
        predictions_file = Path(patch_file).with_suffix(".json")

        try:
            # Create predictions file for swebench
            predictions = [
                {
                    "instance_id": instance_id,
                    "model_name_or_path": "autogpt",
                    "model_patch": patch,
                }
            ]

            with open(predictions_file, "w") as f:
                json.dump(predictions, f)

            # Run evaluation
            results = run_evaluation(
                predictions_path=str(predictions_file),
                swe_bench_tasks=self.HF_DATASET,
                log_dir=str(self.cache_dir / "swe_bench" / "logs"),
                testbed=str(self.cache_dir / "swe_bench" / "testbed"),
                skip_existing=False,
                timeout=1800,
                verbose=False,
            )

            # Check results
            if instance_id in results:
                instance_result = results[instance_id]
                resolved = instance_result.get("resolved", False)
                return {
                    "success": resolved,
                    "score": 1.0 if resolved else 0.0,
                    "error": None if resolved else "Tests did not pass",
                }
            else:
                return {
                    "success": False,
                    "score": 0.0,
                    "error": "Evaluation did not produce results",
                }

        except Exception as e:
            # Deliberately broad: any harness failure becomes a scored failure
            # rather than crashing the whole benchmark run.
            return {
                "success": False,
                "score": 0.0,
                "error": f"Evaluation failed: {str(e)}",
            }

        finally:
            # Cleanup temp files
            Path(patch_file).unlink(missing_ok=True)
            predictions_file.unlink(missing_ok=True)

    def _evaluate_with_modal(
        self, ground: dict[str, Any], patch: str
    ) -> dict[str, Any]:
        """Run evaluation using Modal cloud infrastructure.

        Currently a stub: reports an error result unless/until the Modal
        harness integration is implemented.
        """
        try:
            import modal  # noqa: F401
        except ImportError:
            return {
                "success": False,
                "score": 0.0,
                "error": (
                    "Modal package not installed. " "Install with: pip install modal"
                ),
            }

        # Modal evaluation requires environment setup
        # This is a simplified interface - full implementation would use
        # modal's SWE-bench harness
        return {
            "success": False,
            "score": 0.0,
            "error": (
                "Modal evaluation not yet implemented. "
                "Use local Docker evaluation or submit to SWE-bench leaderboard."
            ),
        }

    def provision_environment(self, challenge: Challenge) -> dict[str, Any]:
        """Provide repository context (repo, commit, clone URL) for the challenge."""
        ground = challenge.ground_truth
        return {
            "repo": ground.get("repo"),
            "base_commit": ground.get("base_commit"),
            "clone_url": f"https://github.com/{ground.get('repo')}.git",
        }

    def get_challenge_count(self) -> Optional[int]:
        """Get the number of challenges after repo-filter/limit are applied."""
        self.ensure_setup()
        if self._dataset is None:
            return None

        count = len(self._dataset)

        # Apply repo filter (only when subset is a repo name, not a variant)
        if self.subset and self.subset not in ("full", "lite", "verified"):
            count = sum(1 for item in self._dataset if item.get("repo") == self.subset)

        # Apply limit
        if self.limit:
            count = min(count, self.limit)

        return count

    def get_metadata(self) -> dict[str, Any]:
        """Get SWE-bench metadata."""
        metadata = super().get_metadata()
        metadata.update(
            {
                "datasets": {
                    "full": self.HF_DATASET,
                    "lite": self.HF_LITE,
                    "verified": self.HF_VERIFIED,
                },
                "subsets": ["full", "lite", "verified"]
                + list(self.REPO_TIMEOUTS.keys()),
                "splits": ["dev", "test"],
                "requires_docker": not self._use_modal,
                "leaderboard": "https://www.swebench.com/",
            }
        )
        return metadata
|
||||
@@ -3,7 +3,8 @@
|
||||
import asyncio
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Union
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
from rich.live import Live
|
||||
|
||||
@@ -14,16 +15,55 @@ from .report import ReportGenerator
|
||||
from .state import StateManager
|
||||
from .ui import BenchmarkUI, JsonUI, QuietUI, console
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .adapters.base import BenchmarkAdapter
|
||||
|
||||
|
||||
class BenchmarkHarness:
|
||||
"""Main benchmark harness orchestrator."""
|
||||
|
||||
def __init__(self, config: HarnessConfig):
|
||||
self.config = config
|
||||
self.loader = ChallengeLoader(config.challenges_dir)
|
||||
self.reporter = ReportGenerator(config.reports_dir)
|
||||
self.state_manager = StateManager(config.reports_dir)
|
||||
|
||||
# Initialize challenge source (adapter or loader)
|
||||
self.adapter: Optional["BenchmarkAdapter"] = None
|
||||
self.loader: Optional[ChallengeLoader] = None
|
||||
|
||||
if config.external_benchmark:
|
||||
self._init_adapter()
|
||||
else:
|
||||
self.loader = ChallengeLoader(config.challenges_dir)
|
||||
|
||||
def _init_adapter(self) -> None:
    """Resolve and instantiate the configured external benchmark adapter.

    Raises:
        ValueError: If the configured benchmark name is not registered.
    """
    from .adapters import get_adapter

    assert self.config.external_benchmark is not None
    adapter_cls = get_adapter(self.config.external_benchmark)
    if adapter_cls is None:
        from .adapters import list_adapters

        raise ValueError(
            f"Unknown benchmark: {self.config.external_benchmark}. "
            f"Available: {list_adapters()}"
        )

    # Fall back to the shared default cache location when none was configured.
    cache_dir = self.config.benchmark_cache_dir
    if cache_dir is None:
        cache_dir = Path.home() / ".cache" / "autogpt_benchmarks"

    self.adapter = adapter_cls(
        cache_dir=cache_dir,
        split=self.config.benchmark_split,
        subset=self.config.benchmark_subset,
        limit=self.config.benchmark_limit,
    )
|
||||
|
||||
async def run(
|
||||
self,
|
||||
ui_mode: str = "default",
|
||||
@@ -115,17 +155,40 @@ class BenchmarkHarness:
|
||||
strategy_names, model_names, self.config.attempts
|
||||
)
|
||||
|
||||
# Load challenges
|
||||
challenges = list(
|
||||
self.loader.load_all(
|
||||
categories=self.config.categories,
|
||||
skip_categories=self.config.skip_categories,
|
||||
names=self.config.test_names,
|
||||
maintain=self.config.maintain,
|
||||
improve=self.config.improve,
|
||||
explore=self.config.explore,
|
||||
# Load challenges (from adapter or local loader)
|
||||
if self.adapter:
|
||||
# External benchmark - load via adapter
|
||||
if ui_mode != "json":
|
||||
subset_str = (
|
||||
f", subset={self.config.benchmark_subset}"
|
||||
if self.config.benchmark_subset
|
||||
else ""
|
||||
)
|
||||
limit_str = (
|
||||
f", limit={self.config.benchmark_limit}"
|
||||
if self.config.benchmark_limit
|
||||
else ""
|
||||
)
|
||||
console.print(
|
||||
f"[cyan]Loading {self.config.external_benchmark} benchmark "
|
||||
f"(split={self.config.benchmark_split}{subset_str}{limit_str})"
|
||||
f"...[/cyan]"
|
||||
)
|
||||
assert self.adapter is not None
|
||||
challenges = list(self.adapter.load_challenges())
|
||||
else:
|
||||
# Local challenges - load via ChallengeLoader
|
||||
assert self.loader is not None
|
||||
challenges = list(
|
||||
self.loader.load_all(
|
||||
categories=self.config.categories,
|
||||
skip_categories=self.config.skip_categories,
|
||||
names=self.config.test_names,
|
||||
maintain=self.config.maintain,
|
||||
improve=self.config.improve,
|
||||
explore=self.config.explore,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
if not challenges:
|
||||
console.print("[red]No challenges found matching filters[/red]")
|
||||
@@ -194,6 +257,7 @@ class BenchmarkHarness:
|
||||
attempts=self.config.attempts,
|
||||
no_cutoff=self.config.no_cutoff,
|
||||
skip_fn=should_skip,
|
||||
adapter=self.adapter,
|
||||
)
|
||||
|
||||
# Ensure workspace exists
|
||||
|
||||
@@ -205,7 +205,7 @@ class BenchmarkConfig(BaseModel):
|
||||
strategy: StrategyName
|
||||
model: ModelConfig
|
||||
max_steps: int = 50
|
||||
timeout_seconds: int = 300
|
||||
timeout_seconds: int = 900
|
||||
|
||||
@property
|
||||
def config_name(self) -> str:
|
||||
@@ -246,6 +246,13 @@ class HarnessConfig(BaseModel):
|
||||
reset_models: Optional[list[str]] = None # Reset specific models
|
||||
reset_challenges: Optional[list[str]] = None # Reset specific challenges
|
||||
|
||||
# External benchmark options
|
||||
external_benchmark: Optional[str] = None # gaia, swe-bench, agent-bench
|
||||
benchmark_split: str = "validation" # train, validation, test
|
||||
benchmark_subset: Optional[str] = None # Difficulty level, repo name, etc.
|
||||
benchmark_limit: Optional[int] = None # Max challenges to load
|
||||
benchmark_cache_dir: Optional[Path] = None # Cache directory for downloads
|
||||
|
||||
model_config = {"arbitrary_types_allowed": True}
|
||||
|
||||
|
||||
|
||||
@@ -2,12 +2,15 @@
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import AsyncIterator, Callable, Optional
|
||||
from typing import TYPE_CHECKING, AsyncIterator, Callable, Optional
|
||||
|
||||
from .evaluator import Evaluator
|
||||
from .models import BenchmarkConfig, Challenge, ChallengeResult, ExecutionProgress
|
||||
from .runner import AgentRunner, StepCallback
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .adapters.base import BenchmarkAdapter
|
||||
|
||||
# Type for skip predicate: (config_name, challenge_name, attempt) -> bool
|
||||
SkipPredicate = Callable[[str, str, int], bool]
|
||||
|
||||
@@ -23,6 +26,7 @@ class ParallelExecutor:
|
||||
attempts: int = 1,
|
||||
no_cutoff: bool = False,
|
||||
skip_fn: Optional[SkipPredicate] = None,
|
||||
adapter: Optional["BenchmarkAdapter"] = None,
|
||||
):
|
||||
self.max_parallel = max_parallel
|
||||
self.on_progress = on_progress
|
||||
@@ -30,6 +34,7 @@ class ParallelExecutor:
|
||||
self.attempts = attempts
|
||||
self.no_cutoff = no_cutoff
|
||||
self.skip_fn = skip_fn
|
||||
self.adapter = adapter
|
||||
self._semaphore = asyncio.Semaphore(max_parallel)
|
||||
self._evaluator = Evaluator()
|
||||
|
||||
@@ -121,8 +126,11 @@ class ParallelExecutor:
|
||||
)
|
||||
result = await runner.run_challenge(challenge, attempt=attempt)
|
||||
|
||||
# Evaluate result
|
||||
result = self._evaluator.evaluate(result, challenge)
|
||||
# Evaluate result - use adapter if available, otherwise standard evaluator
|
||||
if self.adapter is not None:
|
||||
result = self.adapter.evaluate(result, challenge, workspace_root)
|
||||
else:
|
||||
result = self._evaluator.evaluate(result, challenge)
|
||||
|
||||
# Notify completion
|
||||
if self.on_progress:
|
||||
|
||||
@@ -220,6 +220,14 @@ class AgentRunner:
|
||||
"service", # Block service commands
|
||||
]
|
||||
|
||||
# Disable clipboard commands for benchmarks - they add overhead without value
|
||||
app_config.disabled_commands = [
|
||||
"clipboard_copy",
|
||||
"clipboard_paste",
|
||||
"clipboard_list",
|
||||
"clipboard_clear",
|
||||
]
|
||||
|
||||
self._agent = agent
|
||||
self._llm_provider = llm_provider
|
||||
return agent
|
||||
@@ -244,14 +252,28 @@ class AgentRunner:
|
||||
# Propose next action
|
||||
proposal = await agent.propose_action()
|
||||
|
||||
# Check for finish command
|
||||
# Get cumulative cost from LLM provider
|
||||
if self._llm_provider:
|
||||
cumulative_cost = self._llm_provider.get_incurred_cost()
|
||||
|
||||
# Check for finish command - record it and return
|
||||
if proposal.use_tool.name == "finish":
|
||||
steps.append(
|
||||
StepResult(
|
||||
step_num=step_num + 1,
|
||||
tool_name=proposal.use_tool.name,
|
||||
tool_args=proposal.use_tool.arguments,
|
||||
result="Agent finished",
|
||||
is_error=False,
|
||||
cumulative_cost=cumulative_cost,
|
||||
)
|
||||
)
|
||||
return True
|
||||
|
||||
# Execute the action
|
||||
result = await agent.execute(proposal)
|
||||
|
||||
# Get cumulative cost from LLM provider
|
||||
# Update cost after execution
|
||||
if self._llm_provider:
|
||||
cumulative_cost = self._llm_provider.get_incurred_cost()
|
||||
|
||||
|
||||
552
classic/poetry.lock
generated
552
classic/poetry.lock
generated
@@ -6,7 +6,7 @@ version = "2.6.1"
|
||||
description = "Happy Eyeballs for asyncio"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"},
|
||||
{file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"},
|
||||
@@ -18,7 +18,7 @@ version = "3.13.3"
|
||||
description = "Async http client/server framework (asyncio)"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "aiohttp-3.13.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d5a372fd5afd301b3a89582817fdcdb6c34124787c70dbcc616f259013e7eef7"},
|
||||
{file = "aiohttp-3.13.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:147e422fd1223005c22b4fe080f5d93ced44460f5f9c105406b753612b587821"},
|
||||
@@ -175,7 +175,7 @@ version = "1.4.0"
|
||||
description = "aiosignal: a list of registered asynchronous callbacks"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e"},
|
||||
{file = "aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7"},
|
||||
@@ -247,7 +247,7 @@ version = "25.4.0"
|
||||
description = "Classes Without Boilerplate"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373"},
|
||||
{file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"},
|
||||
@@ -1197,7 +1197,7 @@ version = "2026.1.4"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c"},
|
||||
{file = "certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120"},
|
||||
@@ -1319,7 +1319,7 @@ version = "3.4.4"
|
||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "charset_normalizer-3.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e824f1492727fa856dd6eda4f7cee25f8518a12f3c4a56a74e8095695089cf6d"},
|
||||
{file = "charset_normalizer-3.4.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4bd5d4137d500351a30687c2d3971758aac9a19208fc110ccb9d7188fbe709e8"},
|
||||
@@ -1523,12 +1523,12 @@ version = "0.4.6"
|
||||
description = "Cross-platform colored terminal text."
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
markers = {dev = "platform_system == \"Windows\" or sys_platform == \"win32\""}
|
||||
markers = {benchmarks = "platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or sys_platform == \"win32\""}
|
||||
|
||||
[[package]]
|
||||
name = "coloredlogs"
|
||||
@@ -1874,6 +1874,51 @@ files = [
|
||||
{file = "cymem-2.0.13.tar.gz", hash = "sha256:1c91a92ae8c7104275ac26bd4d29b08ccd3e7faff5893d3858cb6fadf1bc1588"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datasets"
|
||||
version = "2.18.0"
|
||||
description = "HuggingFace community-driven open-source library of datasets"
|
||||
optional = false
|
||||
python-versions = ">=3.8.0"
|
||||
groups = ["benchmarks"]
|
||||
files = [
|
||||
{file = "datasets-2.18.0-py3-none-any.whl", hash = "sha256:f1bbf0e2896917a914de01cbd37075b14deea3837af87ad0d9f697388ccaeb50"},
|
||||
{file = "datasets-2.18.0.tar.gz", hash = "sha256:cdf8b8c6abf7316377ba4f49f9589a4c74556d6b481afd0abd2284f3d69185cb"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
aiohttp = "*"
|
||||
dill = ">=0.3.0,<0.3.9"
|
||||
filelock = "*"
|
||||
fsspec = {version = ">=2023.1.0,<=2024.2.0", extras = ["http"]}
|
||||
huggingface-hub = ">=0.19.4"
|
||||
multiprocess = "*"
|
||||
numpy = ">=1.17"
|
||||
packaging = "*"
|
||||
pandas = "*"
|
||||
pyarrow = ">=12.0.0"
|
||||
pyarrow-hotfix = "*"
|
||||
pyyaml = ">=5.1"
|
||||
requests = ">=2.19.0"
|
||||
tqdm = ">=4.62.1"
|
||||
xxhash = "*"
|
||||
|
||||
[package.extras]
|
||||
apache-beam = ["apache-beam (>=2.26.0)"]
|
||||
audio = ["librosa", "soundfile (>=0.12.1)"]
|
||||
benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"]
|
||||
dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0) ; sys_platform != \"win32\" and python_version < \"3.10\"", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14) ; sys_platform != \"win32\"", "jaxlib (>=0.3.14) ; sys_platform != \"win32\"", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1) ; sys_platform != \"darwin\" or platform_machine != \"arm64\"", "tensorflow (>=2.3,!=2.6.0,!=2.6.1) ; sys_platform != \"darwin\" or platform_machine != \"arm64\"", "tensorflow-macos ; sys_platform == \"darwin\" and platform_machine == \"arm64\"", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"]
|
||||
docs = ["s3fs", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1) ; sys_platform != \"darwin\" or platform_machine != \"arm64\"", "tensorflow-macos ; sys_platform == \"darwin\" and platform_machine == \"arm64\"", "torch", "transformers"]
|
||||
jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"]
|
||||
metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"]
|
||||
quality = ["ruff (>=0.3.0)"]
|
||||
s3 = ["s3fs"]
|
||||
tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1) ; sys_platform != \"darwin\" or platform_machine != \"arm64\"", "tensorflow-macos ; sys_platform == \"darwin\" and platform_machine == \"arm64\""]
|
||||
tensorflow-gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"]
|
||||
tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0) ; sys_platform != \"win32\" and python_version < \"3.10\"", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14) ; sys_platform != \"win32\"", "jaxlib (>=0.3.14) ; sys_platform != \"win32\"", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy", "tensorflow (>=2.3,!=2.6.0,!=2.6.1) ; sys_platform != \"darwin\" or platform_machine != \"arm64\"", "tensorflow-macos ; sys_platform == \"darwin\" and platform_machine == \"arm64\"", "tiktoken", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"]
|
||||
torch = ["torch"]
|
||||
vision = ["Pillow (>=6.2.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "dateparser"
|
||||
version = "1.2.2"
|
||||
@@ -1930,6 +1975,22 @@ files = [
|
||||
{file = "demjson3-3.0.6.tar.gz", hash = "sha256:37c83b0c6eb08d25defc88df0a2a4875d58a7809a9650bd6eee7afd8053cdbac"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dill"
|
||||
version = "0.3.8"
|
||||
description = "serialize all of Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["benchmarks"]
|
||||
files = [
|
||||
{file = "dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7"},
|
||||
{file = "dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
graph = ["objgraph (>=1.7.2)"]
|
||||
profile = ["gprof2dot (>=2022.7.29)"]
|
||||
|
||||
[[package]]
|
||||
name = "distlib"
|
||||
version = "0.4.0"
|
||||
@@ -2181,7 +2242,7 @@ version = "3.20.3"
|
||||
description = "A platform independent file lock."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main", "build", "dev"]
|
||||
groups = ["main", "benchmarks", "build", "dev"]
|
||||
files = [
|
||||
{file = "filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1"},
|
||||
{file = "filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1"},
|
||||
@@ -2222,7 +2283,7 @@ version = "1.8.0"
|
||||
description = "A list-like structure which implements collections.abc.MutableSequence"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "frozenlist-1.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b37f6d31b3dcea7deb5e9696e529a6aa4a898adc33db82da12e4c60a7c4d2011"},
|
||||
{file = "frozenlist-1.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef2b7b394f208233e471abc541cc6991f907ffd47dc72584acee3147899d6565"},
|
||||
@@ -2358,25 +2419,27 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "fsspec"
|
||||
version = "2026.1.0"
|
||||
version = "2024.2.0"
|
||||
description = "File-system specification"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "benchmarks"]
|
||||
files = [
|
||||
{file = "fsspec-2026.1.0-py3-none-any.whl", hash = "sha256:cb76aa913c2285a3b49bdd5fc55b1d7c708d7208126b60f2eb8194fe1b4cbdcc"},
|
||||
{file = "fsspec-2026.1.0.tar.gz", hash = "sha256:e987cb0496a0d81bba3a9d1cee62922fb395e7d4c3b575e57f547953334fe07b"},
|
||||
{file = "fsspec-2024.2.0-py3-none-any.whl", hash = "sha256:817f969556fa5916bc682e02ca2045f96ff7f586d45110fcb76022063ad2c7d8"},
|
||||
{file = "fsspec-2024.2.0.tar.gz", hash = "sha256:b6ad1a679f760dda52b1168c859d01b7b80648ea6f7f7c7f5a8a91dc3f3ecb84"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""}
|
||||
|
||||
[package.extras]
|
||||
abfs = ["adlfs"]
|
||||
adl = ["adlfs"]
|
||||
arrow = ["pyarrow (>=1)"]
|
||||
dask = ["dask", "distributed"]
|
||||
dev = ["pre-commit", "ruff (>=0.5)"]
|
||||
doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"]
|
||||
devel = ["pytest", "pytest-cov"]
|
||||
dropbox = ["dropbox", "dropboxdrivefs", "requests"]
|
||||
full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs (>2024.2.0)", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs (>2024.2.0)", "smbprotocol", "tqdm"]
|
||||
full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"]
|
||||
fuse = ["fusepy"]
|
||||
gcs = ["gcsfs"]
|
||||
git = ["pygit2"]
|
||||
@@ -2391,9 +2454,6 @@ s3 = ["s3fs"]
|
||||
sftp = ["paramiko"]
|
||||
smb = ["smbprotocol"]
|
||||
ssh = ["paramiko"]
|
||||
test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"]
|
||||
test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"]
|
||||
test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "backports-zstd ; python_version < \"3.14\"", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr"]
|
||||
tqdm = ["tqdm"]
|
||||
|
||||
[[package]]
|
||||
@@ -2977,42 +3037,6 @@ files = [
|
||||
hpack = ">=4.1,<5"
|
||||
hyperframe = ">=6.1,<7"
|
||||
|
||||
[[package]]
|
||||
name = "hf-xet"
|
||||
version = "1.2.0"
|
||||
description = "Fast transfer of large files with the Hugging Face Hub."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""
|
||||
files = [
|
||||
{file = "hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649"},
|
||||
{file = "hf_xet-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813"},
|
||||
{file = "hf_xet-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc"},
|
||||
{file = "hf_xet-1.2.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5"},
|
||||
{file = "hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f"},
|
||||
{file = "hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832"},
|
||||
{file = "hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382"},
|
||||
{file = "hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e"},
|
||||
{file = "hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8"},
|
||||
{file = "hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0"},
|
||||
{file = "hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090"},
|
||||
{file = "hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a"},
|
||||
{file = "hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f"},
|
||||
{file = "hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc"},
|
||||
{file = "hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848"},
|
||||
{file = "hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4"},
|
||||
{file = "hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd"},
|
||||
{file = "hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c"},
|
||||
{file = "hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737"},
|
||||
{file = "hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865"},
|
||||
{file = "hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69"},
|
||||
{file = "hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
tests = ["pytest"]
|
||||
|
||||
[[package]]
|
||||
name = "hpack"
|
||||
version = "4.1.0"
|
||||
@@ -3170,39 +3194,36 @@ zstd = ["zstandard (>=0.18.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "huggingface-hub"
|
||||
version = "1.3.2"
|
||||
version = "0.20.3"
|
||||
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
||||
optional = false
|
||||
python-versions = ">=3.9.0"
|
||||
groups = ["main"]
|
||||
python-versions = ">=3.8.0"
|
||||
groups = ["main", "benchmarks"]
|
||||
files = [
|
||||
{file = "huggingface_hub-1.3.2-py3-none-any.whl", hash = "sha256:b552b9562a5532102a041fa31a6966bb9de95138fc7aa578bb3703198c25d1b6"},
|
||||
{file = "huggingface_hub-1.3.2.tar.gz", hash = "sha256:15d7902e154f04174a0816d1e9594adcf15cdad57596920a5dc70fadb5d896c7"},
|
||||
{file = "huggingface_hub-0.20.3-py3-none-any.whl", hash = "sha256:d988ae4f00d3e307b0c80c6a05ca6dbb7edba8bba3079f74cda7d9c2e562a7b6"},
|
||||
{file = "huggingface_hub-0.20.3.tar.gz", hash = "sha256:94e7f8e074475fbc67d6a71957b678e1b4a74ff1b64a644fd6cbb83da962d05d"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
filelock = "*"
|
||||
fsspec = ">=2023.5.0"
|
||||
hf-xet = {version = ">=1.2.0,<2.0.0", markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""}
|
||||
httpx = ">=0.23.0,<1"
|
||||
packaging = ">=20.9"
|
||||
pyyaml = ">=5.1"
|
||||
shellingham = "*"
|
||||
requests = "*"
|
||||
tqdm = ">=4.42.1"
|
||||
typer-slim = "*"
|
||||
typing-extensions = ">=4.1.0"
|
||||
typing-extensions = ">=3.7.4.3"
|
||||
|
||||
[package.extras]
|
||||
all = ["Jinja2", "Pillow", "authlib (>=1.3.2)", "fastapi", "fastapi", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0)", "numpy", "pytest (>=8.4.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "ty", "types-PyYAML", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
|
||||
dev = ["Jinja2", "Pillow", "authlib (>=1.3.2)", "fastapi", "fastapi", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0)", "numpy", "pytest (>=8.4.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "ty", "types-PyYAML", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
|
||||
all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (>1.1,<2.0) ; python_version == \"3.8\"", "pydantic (>1.1,<3.0) ; python_version > \"3.8\"", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.1.3)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
|
||||
cli = ["InquirerPy (==0.3.4)"]
|
||||
dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (>1.1,<2.0) ; python_version == \"3.8\"", "pydantic (>1.1,<3.0) ; python_version > \"3.8\"", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.1.3)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
|
||||
fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
|
||||
hf-xet = ["hf-xet (>=1.2.0,<2.0.0)"]
|
||||
mcp = ["mcp (>=1.8.0)"]
|
||||
oauth = ["authlib (>=1.3.2)", "fastapi", "httpx", "itsdangerous"]
|
||||
quality = ["libcst (>=1.4.0)", "mypy (==1.15.0)", "ruff (>=0.9.0)", "ty"]
|
||||
testing = ["Jinja2", "Pillow", "authlib (>=1.3.2)", "fastapi", "fastapi", "httpx", "itsdangerous", "jedi", "numpy", "pytest (>=8.4.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
|
||||
torch = ["safetensors[torch]", "torch"]
|
||||
typing = ["types-PyYAML", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
|
||||
inference = ["aiohttp", "pydantic (>1.1,<2.0) ; python_version == \"3.8\"", "pydantic (>1.1,<3.0) ; python_version > \"3.8\""]
|
||||
quality = ["mypy (==1.5.1)", "ruff (>=0.1.3)"]
|
||||
tensorflow = ["graphviz", "pydot", "tensorflow"]
|
||||
testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic (>1.1,<2.0) ; python_version == \"3.8\"", "pydantic (>1.1,<3.0) ; python_version > \"3.8\"", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
|
||||
torch = ["torch"]
|
||||
typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "humanfriendly"
|
||||
@@ -3276,7 +3297,7 @@ version = "3.11"
|
||||
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"},
|
||||
{file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"},
|
||||
@@ -4187,7 +4208,7 @@ version = "6.7.0"
|
||||
description = "multidict implementation"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "multidict-6.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9f474ad5acda359c8758c8accc22032c6abe6dc87a8be2440d097785e27a9349"},
|
||||
{file = "multidict-6.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a9db5a870f780220e931d0002bbfd88fb53aceb6293251e2c839415c1b20e"},
|
||||
@@ -4337,6 +4358,31 @@ files = [
|
||||
{file = "multidict-6.7.0.tar.gz", hash = "sha256:c6e99d9a65ca282e578dfea819cfa9c0a62b2499d8677392e09feaf305e9e6f5"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "multiprocess"
|
||||
version = "0.70.16"
|
||||
description = "better multiprocessing and multithreading in Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["benchmarks"]
|
||||
files = [
|
||||
{file = "multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee"},
|
||||
{file = "multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec"},
|
||||
{file = "multiprocess-0.70.16-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:37b55f71c07e2d741374998c043b9520b626a8dddc8b3129222ca4f1a06ef67a"},
|
||||
{file = "multiprocess-0.70.16-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ba8c31889abf4511c7308a8c52bb4a30b9d590e7f58523302ba00237702ca054"},
|
||||
{file = "multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41"},
|
||||
{file = "multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a"},
|
||||
{file = "multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02"},
|
||||
{file = "multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a"},
|
||||
{file = "multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e"},
|
||||
{file = "multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435"},
|
||||
{file = "multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3"},
|
||||
{file = "multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
dill = ">=0.3.8"
|
||||
|
||||
[[package]]
|
||||
name = "murmurhash"
|
||||
version = "1.0.12"
|
||||
@@ -4502,7 +4548,7 @@ version = "2.0.2"
|
||||
description = "Fundamental package for array computing in Python"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
groups = ["main", "benchmarks"]
|
||||
markers = "python_version >= \"3.14\""
|
||||
files = [
|
||||
{file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"},
|
||||
@@ -4558,7 +4604,7 @@ version = "2.4.1"
|
||||
description = "Fundamental package for array computing in Python"
|
||||
optional = false
|
||||
python-versions = ">=3.11"
|
||||
groups = ["main"]
|
||||
groups = ["main", "benchmarks"]
|
||||
markers = "python_version < \"3.14\""
|
||||
files = [
|
||||
{file = "numpy-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0cce2a669e3c8ba02ee563c7835f92c153cf02edff1ae05e1823f1dde21b16a5"},
|
||||
@@ -4941,12 +4987,108 @@ version = "25.0"
|
||||
description = "Core utilities for Python packages"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"},
|
||||
{file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pandas"
|
||||
version = "2.3.3"
|
||||
description = "Powerful data structures for data analysis, time series, and statistics"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["benchmarks"]
|
||||
files = [
|
||||
{file = "pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c"},
|
||||
{file = "pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a"},
|
||||
{file = "pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1"},
|
||||
{file = "pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838"},
|
||||
{file = "pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250"},
|
||||
{file = "pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4"},
|
||||
{file = "pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826"},
|
||||
{file = "pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523"},
|
||||
{file = "pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45"},
|
||||
{file = "pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66"},
|
||||
{file = "pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b"},
|
||||
{file = "pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791"},
|
||||
{file = "pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151"},
|
||||
{file = "pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c"},
|
||||
{file = "pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53"},
|
||||
{file = "pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35"},
|
||||
{file = "pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908"},
|
||||
{file = "pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89"},
|
||||
{file = "pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98"},
|
||||
{file = "pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084"},
|
||||
{file = "pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b"},
|
||||
{file = "pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713"},
|
||||
{file = "pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8"},
|
||||
{file = "pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d"},
|
||||
{file = "pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac"},
|
||||
{file = "pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c"},
|
||||
{file = "pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493"},
|
||||
{file = "pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee"},
|
||||
{file = "pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5"},
|
||||
{file = "pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21"},
|
||||
{file = "pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78"},
|
||||
{file = "pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110"},
|
||||
{file = "pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86"},
|
||||
{file = "pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc"},
|
||||
{file = "pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0"},
|
||||
{file = "pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593"},
|
||||
{file = "pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c"},
|
||||
{file = "pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b"},
|
||||
{file = "pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6"},
|
||||
{file = "pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3"},
|
||||
{file = "pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5"},
|
||||
{file = "pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec"},
|
||||
{file = "pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7"},
|
||||
{file = "pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450"},
|
||||
{file = "pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5"},
|
||||
{file = "pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788"},
|
||||
{file = "pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87"},
|
||||
{file = "pandas-2.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2"},
|
||||
{file = "pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8"},
|
||||
{file = "pandas-2.3.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff"},
|
||||
{file = "pandas-2.3.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29"},
|
||||
{file = "pandas-2.3.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73"},
|
||||
{file = "pandas-2.3.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9"},
|
||||
{file = "pandas-2.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa"},
|
||||
{file = "pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
numpy = {version = ">=1.26.0", markers = "python_version >= \"3.12\""}
|
||||
python-dateutil = ">=2.8.2"
|
||||
pytz = ">=2020.1"
|
||||
tzdata = ">=2022.7"
|
||||
|
||||
[package.extras]
|
||||
all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"]
|
||||
aws = ["s3fs (>=2022.11.0)"]
|
||||
clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"]
|
||||
compression = ["zstandard (>=0.19.0)"]
|
||||
computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"]
|
||||
consortium-standard = ["dataframe-api-compat (>=0.1.7)"]
|
||||
excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"]
|
||||
feather = ["pyarrow (>=10.0.1)"]
|
||||
fss = ["fsspec (>=2022.11.0)"]
|
||||
gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"]
|
||||
hdf5 = ["tables (>=3.8.0)"]
|
||||
html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"]
|
||||
mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"]
|
||||
output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"]
|
||||
parquet = ["pyarrow (>=10.0.1)"]
|
||||
performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"]
|
||||
plot = ["matplotlib (>=3.6.3)"]
|
||||
postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"]
|
||||
pyarrow = ["pyarrow (>=10.0.1)"]
|
||||
spss = ["pyreadstat (>=1.2.0)"]
|
||||
sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"]
|
||||
test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"]
|
||||
xml = ["lxml (>=4.9.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "patchelf"
|
||||
version = "0.17.2.4"
|
||||
@@ -5381,7 +5523,7 @@ version = "0.4.1"
|
||||
description = "Accelerated property cache"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "propcache-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c2d1fa3201efaf55d730400d945b5b3ab6e672e100ba0f9a409d950ab25d7db"},
|
||||
{file = "propcache-0.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1eb2994229cc8ce7fe9b3db88f5465f5fd8651672840b2e426b88cdb1a30aac8"},
|
||||
@@ -5545,6 +5687,78 @@ files = [
|
||||
{file = "protobuf-6.33.4.tar.gz", hash = "sha256:dc2e61bca3b10470c1912d166fe0af67bfc20eb55971dcef8dfa48ce14f0ed91"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyarrow"
|
||||
version = "23.0.0"
|
||||
description = "Python library for Apache Arrow"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["benchmarks"]
|
||||
files = [
|
||||
{file = "pyarrow-23.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:cbdc2bf5947aa4d462adcf8453cf04aee2f7932653cb67a27acd96e5e8528a67"},
|
||||
{file = "pyarrow-23.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:4d38c836930ce15cd31dce20114b21ba082da231c884bdc0a7b53e1477fe7f07"},
|
||||
{file = "pyarrow-23.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:4222ff8f76919ecf6c716175a0e5fddb5599faeed4c56d9ea41a2c42be4998b2"},
|
||||
{file = "pyarrow-23.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:87f06159cbe38125852657716889296c83c37b4d09a5e58f3d10245fd1f69795"},
|
||||
{file = "pyarrow-23.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:1675c374570d8b91ea6d4edd4608fa55951acd44e0c31bd146e091b4005de24f"},
|
||||
{file = "pyarrow-23.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:247374428fde4f668f138b04031a7e7077ba5fa0b5b1722fdf89a017bf0b7ee0"},
|
||||
{file = "pyarrow-23.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:de53b1bd3b88a2ee93c9af412c903e57e738c083be4f6392288294513cd8b2c1"},
|
||||
{file = "pyarrow-23.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5574d541923efcbfdf1294a2746ae3b8c2498a2dc6cd477882f6f4e7b1ac08d3"},
|
||||
{file = "pyarrow-23.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:2ef0075c2488932e9d3c2eb3482f9459c4be629aa673b725d5e3cf18f777f8e4"},
|
||||
{file = "pyarrow-23.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:65666fc269669af1ef1c14478c52222a2aa5c907f28b68fb50a203c777e4f60c"},
|
||||
{file = "pyarrow-23.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:4d85cb6177198f3812db4788e394b757223f60d9a9f5ad6634b3e32be1525803"},
|
||||
{file = "pyarrow-23.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1a9ff6fa4141c24a03a1a434c63c8fa97ce70f8f36bccabc18ebba905ddf0f17"},
|
||||
{file = "pyarrow-23.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:84839d060a54ae734eb60a756aeacb62885244aaa282f3c968f5972ecc7b1ecc"},
|
||||
{file = "pyarrow-23.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a149a647dbfe928ce8830a713612aa0b16e22c64feac9d1761529778e4d4eaa5"},
|
||||
{file = "pyarrow-23.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:5961a9f646c232697c24f54d3419e69b4261ba8a8b66b0ac54a1851faffcbab8"},
|
||||
{file = "pyarrow-23.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:632b3e7c3d232f41d64e1a4a043fb82d44f8a349f339a1188c6a0dd9d2d47d8a"},
|
||||
{file = "pyarrow-23.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:76242c846db1411f1d6c2cc3823be6b86b40567ee24493344f8226ba34a81333"},
|
||||
{file = "pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b73519f8b52ae28127000986bf228fda781e81d3095cd2d3ece76eb5cf760e1b"},
|
||||
{file = "pyarrow-23.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:068701f6823449b1b6469120f399a1239766b117d211c5d2519d4ed5861f75de"},
|
||||
{file = "pyarrow-23.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1801ba947015d10e23bca9dd6ef5d0e9064a81569a89b6e9a63b59224fd060df"},
|
||||
{file = "pyarrow-23.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:52265266201ec25b6839bf6bd4ea918ca6d50f31d13e1cf200b4261cd11dc25c"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:ad96a597547af7827342ffb3c503c8316e5043bb09b47a84885ce39394c96e00"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:b9edf990df77c2901e79608f08c13fbde60202334a4fcadb15c1f57bf7afee43"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:36d1b5bc6ddcaff0083ceec7e2561ed61a51f49cce8be079ee8ed406acb6fdef"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4292b889cd224f403304ddda8b63a36e60f92911f89927ec8d98021845ea21be"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:dfd9e133e60eaa847fd80530a1b89a052f09f695d0b9c34c235ea6b2e0924cf7"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832141cc09fac6aab1cd3719951d23301396968de87080c57c9a7634e0ecd068"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:7a7d067c9a88faca655c71bcc30ee2782038d59c802d57950826a07f60d83c4c"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:ce9486e0535a843cf85d990e2ec5820a47918235183a5c7b8b97ed7e92c2d47d"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:075c29aeaa685fd1182992a9ed2499c66f084ee54eea47da3eb76e125e06064c"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:799965a5379589510d888be3094c2296efd186a17ca1cef5b77703d4d5121f53"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:ef7cac8fe6fccd8b9e7617bfac785b0371a7fe26af59463074e4882747145d40"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15a414f710dc927132dd67c361f78c194447479555af57317066ee5116b90e9e"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3e0d2e6915eca7d786be6a77bf227fbc06d825a75b5b5fe9bcbef121dec32685"},
|
||||
{file = "pyarrow-23.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:4b317ea6e800b5704e5e5929acb6e2dc13e9276b708ea97a39eb8b345aa2658b"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:20b187ed9550d233a872074159f765f52f9d92973191cd4b93f293a19efbe377"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:18ec84e839b493c3886b9b5e06861962ab4adfaeb79b81c76afbd8d84c7d5fda"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:e438dd3f33894e34fd02b26bd12a32d30d006f5852315f611aa4add6c7fab4bc"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:a244279f240c81f135631be91146d7fa0e9e840e1dfed2aba8483eba25cd98e6"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c4692e83e42438dba512a570c6eaa42be2f8b6c0f492aea27dec54bdc495103a"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ae7f30f898dfe44ea69654a35c93e8da4cef6606dc4c72394068fd95f8e9f54a"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:5b86bb649e4112fb0614294b7d0a175c7513738876b89655605ebb87c804f861"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:ebc017d765d71d80a3f8584ca0566b53e40464586585ac64176115baa0ada7d3"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:0800cc58a6d17d159df823f87ad66cefebf105b982493d4bad03ee7fab84b993"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3a7c68c722da9bb5b0f8c10e3eae71d9825a4b429b40b32709df5d1fa55beb3d"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:bd5556c24622df90551063ea41f559b714aa63ca953db884cfb958559087a14e"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54810f6e6afc4ffee7c2e0051b61722fbea9a4961b46192dcfae8ea12fa09059"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:14de7d48052cf4b0ed174533eafa3cfe0711b8076ad70bede32cf59f744f0d7c"},
|
||||
{file = "pyarrow-23.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:427deac1f535830a744a4f04a6ac183a64fcac4341b3f618e693c41b7b98d2b0"},
|
||||
{file = "pyarrow-23.0.0.tar.gz", hash = "sha256:180e3150e7edfcd182d3d9afba72f7cf19839a497cc76555a8dce998a8f67615"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyarrow-hotfix"
|
||||
version = "0.7"
|
||||
description = ""
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
groups = ["benchmarks"]
|
||||
files = [
|
||||
{file = "pyarrow_hotfix-0.7-py3-none-any.whl", hash = "sha256:3236f3b5f1260f0e2ac070a55c1a7b339c4bb7267839bd2015e283234e758100"},
|
||||
{file = "pyarrow_hotfix-0.7.tar.gz", hash = "sha256:59399cd58bdd978b2e42816a4183a55c6472d4e33d183351b6069f11ed42661d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyasn1"
|
||||
version = "0.6.2"
|
||||
@@ -6254,7 +6468,7 @@ version = "2.9.0.post0"
|
||||
description = "Extensions to the standard Python datetime module"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
|
||||
groups = ["main"]
|
||||
groups = ["main", "benchmarks"]
|
||||
files = [
|
||||
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
|
||||
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
|
||||
@@ -6315,7 +6529,7 @@ version = "2025.2"
|
||||
description = "World timezone definitions, modern and historical"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
groups = ["main", "benchmarks"]
|
||||
files = [
|
||||
{file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"},
|
||||
{file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"},
|
||||
@@ -6358,7 +6572,7 @@ version = "6.0.3"
|
||||
description = "YAML parser and emitter for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
|
||||
{file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
|
||||
@@ -6599,7 +6813,7 @@ version = "2.32.5"
|
||||
description = "Python HTTP for Humans."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6"},
|
||||
{file = "requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf"},
|
||||
@@ -6893,7 +7107,7 @@ version = "1.17.0"
|
||||
description = "Python 2 and 3 compatibility utilities"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
|
||||
groups = ["main"]
|
||||
groups = ["main", "benchmarks"]
|
||||
files = [
|
||||
{file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"},
|
||||
{file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
|
||||
@@ -7739,7 +7953,7 @@ version = "4.67.1"
|
||||
description = "Fast, Extensible Progress Meter"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
groups = ["main", "benchmarks"]
|
||||
files = [
|
||||
{file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"},
|
||||
{file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"},
|
||||
@@ -7940,7 +8154,7 @@ version = "4.15.0"
|
||||
description = "Backported and Experimental Type Hints for Python 3.9+"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"},
|
||||
{file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"},
|
||||
@@ -7967,12 +8181,12 @@ version = "2025.3"
|
||||
description = "Provider of IANA time zone data"
|
||||
optional = false
|
||||
python-versions = ">=2"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Windows\""
|
||||
groups = ["main", "benchmarks"]
|
||||
files = [
|
||||
{file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"},
|
||||
{file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"},
|
||||
]
|
||||
markers = {main = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "tzlocal"
|
||||
@@ -8010,7 +8224,7 @@ version = "2.6.3"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4"},
|
||||
{file = "urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed"},
|
||||
@@ -8606,13 +8820,163 @@ files = [
|
||||
[package.dependencies]
|
||||
h11 = ">=0.16.0,<1"
|
||||
|
||||
[[package]]
|
||||
name = "xxhash"
|
||||
version = "3.6.0"
|
||||
description = "Python binding for xxHash"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["benchmarks"]
|
||||
files = [
|
||||
{file = "xxhash-3.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:87ff03d7e35c61435976554477a7f4cd1704c3596a89a8300d5ce7fc83874a71"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f572dfd3d0e2eb1a57511831cf6341242f5a9f8298a45862d085f5b93394a27d"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:89952ea539566b9fed2bbd94e589672794b4286f342254fad28b149f9615fef8"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48e6f2ffb07a50b52465a1032c3cf1f4a5683f944acaca8a134a2f23674c2058"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b5b848ad6c16d308c3ac7ad4ba6bede80ed5df2ba8ed382f8932df63158dd4b2"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a034590a727b44dd8ac5914236a7b8504144447a9682586c3327e935f33ec8cc"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a8f1972e75ebdd161d7896743122834fe87378160c20e97f8b09166213bf8cc"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ee34327b187f002a596d7b167ebc59a1b729e963ce645964bbc050d2f1b73d07"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:339f518c3c7a850dd033ab416ea25a692759dc7478a71131fe8869010d2b75e4"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:bf48889c9630542d4709192578aebbd836177c9f7a4a2778a7d6340107c65f06"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:5576b002a56207f640636056b4160a378fe36a58db73ae5c27a7ec8db35f71d4"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af1f3278bd02814d6dedc5dec397993b549d6f16c19379721e5a1d31e132c49b"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-win32.whl", hash = "sha256:aed058764db109dc9052720da65fafe84873b05eb8b07e5e653597951af57c3b"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:e82da5670f2d0d98950317f82a0e4a0197150ff19a6df2ba40399c2a3b9ae5fb"},
|
||||
{file = "xxhash-3.6.0-cp310-cp310-win_arm64.whl", hash = "sha256:4a082ffff8c6ac07707fb6b671caf7c6e020c75226c561830b73d862060f281d"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b47bbd8cf2d72797f3c2772eaaac0ded3d3af26481a26d7d7d41dc2d3c46b04a"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2b6821e94346f96db75abaa6e255706fb06ebd530899ed76d32cd99f20dc52fa"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d0a9751f71a1a65ce3584e9cae4467651c7e70c9d31017fa57574583a4540248"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b29ee68625ab37b04c0b40c3fafdf24d2f75ccd778333cfb698f65f6c463f62"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6812c25fe0d6c36a46ccb002f40f27ac903bf18af9f6dd8f9669cb4d176ab18f"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4ccbff013972390b51a18ef1255ef5ac125c92dc9143b2d1909f59abc765540e"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:297b7fbf86c82c550e12e8fb71968b3f033d27b874276ba3624ea868c11165a8"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dea26ae1eb293db089798d3973a5fc928a18fdd97cc8801226fae705b02b14b0"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7a0b169aafb98f4284f73635a8e93f0735f9cbde17bd5ec332480484241aaa77"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:08d45aef063a4531b785cd72de4887766d01dc8f362a515693df349fdb825e0c"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:929142361a48ee07f09121fe9e96a84950e8d4df3bb298ca5d88061969f34d7b"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:51312c768403d8540487dbbfb557454cfc55589bbde6424456951f7fcd4facb3"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-win32.whl", hash = "sha256:d1927a69feddc24c987b337ce81ac15c4720955b667fe9b588e02254b80446fd"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:26734cdc2d4ffe449b41d186bbeac416f704a482ed835d375a5c0cb02bc63fef"},
|
||||
{file = "xxhash-3.6.0-cp311-cp311-win_arm64.whl", hash = "sha256:d72f67ef8bf36e05f5b6c65e8524f265bd61071471cd4cf1d36743ebeeeb06b7"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:01362c4331775398e7bb34e3ab403bc9ee9f7c497bc7dee6272114055277dd3c"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7b2df81a23f8cb99656378e72501b2cb41b1827c0f5a86f87d6b06b69f9f204"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dc94790144e66b14f67b10ac8ed75b39ca47536bf8800eb7c24b50271ea0c490"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93f107c673bccf0d592cdba077dedaf52fe7f42dcd7676eba1f6d6f0c3efffd2"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2aa5ee3444c25b69813663c9f8067dcfaa2e126dc55e8dddf40f4d1c25d7effa"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7f99123f0e1194fa59cc69ad46dbae2e07becec5df50a0509a808f90a0f03f0"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:49e03e6fe2cac4a1bc64952dd250cf0dbc5ef4ebb7b8d96bce82e2de163c82a2"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bd17fede52a17a4f9a7bc4472a5867cb0b160deeb431795c0e4abe158bc784e9"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6fb5f5476bef678f69db04f2bd1efbed3030d2aba305b0fc1773645f187d6a4e"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:843b52f6d88071f87eba1631b684fcb4b2068cd2180a0224122fe4ef011a9374"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7d14a6cfaf03b1b6f5f9790f76880601ccc7896aff7ab9cd8978a939c1eb7e0d"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:418daf3db71e1413cfe211c2f9a528456936645c17f46b5204705581a45390ae"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-win32.whl", hash = "sha256:50fc255f39428a27299c20e280d6193d8b63b8ef8028995323bf834a026b4fbb"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:c0f2ab8c715630565ab8991b536ecded9416d615538be8ecddce43ccf26cbc7c"},
|
||||
{file = "xxhash-3.6.0-cp312-cp312-win_arm64.whl", hash = "sha256:eae5c13f3bc455a3bbb68bdc513912dc7356de7e2280363ea235f71f54064829"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:599e64ba7f67472481ceb6ee80fa3bd828fd61ba59fb11475572cc5ee52b89ec"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7d8b8aaa30fca4f16f0c84a5c8d7ddee0e25250ec2796c973775373257dde8f1"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d597acf8506d6e7101a4a44a5e428977a51c0fadbbfd3c39650cca9253f6e5a6"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:858dc935963a33bc33490128edc1c12b0c14d9c7ebaa4e387a7869ecc4f3e263"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba284920194615cb8edf73bf52236ce2e1664ccd4a38fdb543506413529cc546"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b54219177f6c6674d5378bd862c6aedf64725f70dd29c472eaae154df1a2e89"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:42c36dd7dbad2f5238950c377fcbf6811b1cdb1c444fab447960030cea60504d"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f22927652cba98c44639ffdc7aaf35828dccf679b10b31c4ad72a5b530a18eb7"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b45fad44d9c5c119e9c6fbf2e1c656a46dc68e280275007bbfd3d572b21426db"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6f2580ffab1a8b68ef2b901cde7e55fa8da5e4be0977c68f78fc80f3c143de42"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:40c391dd3cd041ebc3ffe6f2c862f402e306eb571422e0aa918d8070ba31da11"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f205badabde7aafd1a31e8ca2a3e5a763107a71c397c4481d6a804eb5063d8bd"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-win32.whl", hash = "sha256:2577b276e060b73b73a53042ea5bd5203d3e6347ce0d09f98500f418a9fcf799"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:757320d45d2fbcce8f30c42a6b2f47862967aea7bf458b9625b4bbe7ee390392"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313-win_arm64.whl", hash = "sha256:457b8f85dec5825eed7b69c11ae86834a018b8e3df5e77783c999663da2f96d6"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a42e633d75cdad6d625434e3468126c73f13f7584545a9cf34e883aa1710e702"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:568a6d743219e717b07b4e03b0a828ce593833e498c3b64752e0f5df6bfe84db"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bec91b562d8012dae276af8025a55811b875baace6af510412a5e58e3121bc54"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78e7f2f4c521c30ad5e786fdd6bae89d47a32672a80195467b5de0480aa97b1f"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3ed0df1b11a79856df5ffcab572cbd6b9627034c1c748c5566fa79df9048a7c5"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0e4edbfc7d420925b0dd5e792478ed393d6e75ff8fc219a6546fb446b6a417b1"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fba27a198363a7ef87f8c0f6b171ec36b674fe9053742c58dd7e3201c1ab30ee"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:794fe9145fe60191c6532fa95063765529770edcdd67b3d537793e8004cabbfd"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:6105ef7e62b5ac73a837778efc331a591d8442f8ef5c7e102376506cb4ae2729"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:f01375c0e55395b814a679b3eea205db7919ac2af213f4a6682e01220e5fe292"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d706dca2d24d834a4661619dcacf51a75c16d65985718d6a7d73c1eeeb903ddf"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5f059d9faeacd49c0215d66f4056e1326c80503f51a1532ca336a385edadd033"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-win32.whl", hash = "sha256:1244460adc3a9be84731d72b8e80625788e5815b68da3da8b83f78115a40a7ec"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b1e420ef35c503869c4064f4a2f2b08ad6431ab7b229a05cce39d74268bca6b8"},
|
||||
{file = "xxhash-3.6.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ec44b73a4220623235f67a996c862049f375df3b1052d9899f40a6382c32d746"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a40a3d35b204b7cc7643cbcf8c9976d818cb47befcfac8bbefec8038ac363f3e"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a54844be970d3fc22630b32d515e79a90d0a3ddb2644d8d7402e3c4c8da61405"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:016e9190af8f0a4e3741343777710e3d5717427f175adfdc3e72508f59e2a7f3"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4f6f72232f849eb9d0141e2ebe2677ece15adfd0fa599bc058aad83c714bb2c6"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:63275a8aba7865e44b1813d2177e0f5ea7eadad3dd063a21f7cf9afdc7054063"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3cd01fa2aa00d8b017c97eb46b9a794fbdca53fc14f845f5a328c71254b0abb7"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0226aa89035b62b6a86d3c68df4d7c1f47a342b8683da2b60cedcddb46c4d95b"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c6e193e9f56e4ca4923c61238cdaced324f0feac782544eb4c6d55ad5cc99ddd"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9176dcaddf4ca963d4deb93866d739a343c01c969231dbe21680e13a5d1a5bf0"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c1ce4009c97a752e682b897aa99aef84191077a9433eb237774689f14f8ec152"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:8cb2f4f679b01513b7adbb9b1b2f0f9cdc31b70007eaf9d59d0878809f385b11"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:653a91d7c2ab54a92c19ccf43508b6a555440b9be1bc8be553376778be7f20b5"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-win32.whl", hash = "sha256:a756fe893389483ee8c394d06b5ab765d96e68fbbfe6fde7aa17e11f5720559f"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-win_amd64.whl", hash = "sha256:39be8e4e142550ef69629c9cd71b88c90e9a5db703fecbcf265546d9536ca4ad"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314-win_arm64.whl", hash = "sha256:25915e6000338999236f1eb68a02a32c3275ac338628a7eaa5a269c401995679"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c5294f596a9017ca5a3e3f8884c00b91ab2ad2933cf288f4923c3fd4346cf3d4"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1cf9dcc4ab9cff01dfbba78544297a3a01dafd60f3bde4e2bfd016cf7e4ddc67"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:01262da8798422d0685f7cef03b2bd3f4f46511b02830861df548d7def4402ad"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51a73fb7cb3a3ead9f7a8b583ffd9b8038e277cdb8cb87cf890e88b3456afa0b"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b9c6df83594f7df8f7f708ce5ebeacfc69f72c9fbaaababf6cf4758eaada0c9b"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:627f0af069b0ea56f312fd5189001c24578868643203bca1abbc2c52d3a6f3ca"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aa912c62f842dfd013c5f21a642c9c10cd9f4c4e943e0af83618b4a404d9091a"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:b465afd7909db30168ab62afe40b2fcf79eedc0b89a6c0ab3123515dc0df8b99"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:a881851cf38b0a70e7c4d3ce81fc7afd86fbc2a024f4cfb2a97cf49ce04b75d3"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9b3222c686a919a0f3253cfc12bb118b8b103506612253b5baeaac10d8027cf6"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:c5aa639bc113e9286137cec8fadc20e9cd732b2cc385c0b7fa673b84fc1f2a93"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5c1343d49ac102799905e115aee590183c3921d475356cb24b4de29a4bc56518"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f"},
|
||||
{file = "xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7dac94fad14a3d1c92affb661021e1d5cbcf3876be5f5b4d90730775ccb7ac41"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6965e0e90f1f0e6cb78da568c13d4a348eeb7f40acfd6d43690a666a459458b8"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:2ab89a6b80f22214b43d98693c30da66af910c04f9858dd39c8e570749593d7e"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4903530e866b7a9c1eadfd3fa2fbe1b97d3aed4739a80abf506eb9318561c850"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4da8168ae52c01ac64c511d6f4a709479da8b7a4a1d7621ed51652f93747dffa"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:97460eec202017f719e839a0d3551fbc0b2fcc9c6c6ffaa5af85bbd5de432788"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:45aae0c9df92e7fa46fbb738737324a563c727990755ec1965a6a339ea10a1df"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:0d50101e57aad86f4344ca9b32d091a2135a9d0a4396f19133426c88025b09f1"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9085e798c163ce310d91f8aa6b325dda3c2944c93c6ce1edb314030d4167cc65"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:a87f271a33fad0e5bf3be282be55d78df3a45ae457950deb5241998790326f87"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:9e040d3e762f84500961791fa3709ffa4784d4dcd7690afc655c095e02fff05f"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:b0359391c3dad6de872fefb0cf5b69d55b0655c55ee78b1bb7a568979b2ce96b"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-win32.whl", hash = "sha256:e4ff728a2894e7f436b9e94c667b0f426b9c74b71f900cf37d5468c6b5da0536"},
|
||||
{file = "xxhash-3.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:01be0c5b500c5362871fc9cfdf58c69b3e5c4f531a82229ddb9eb1eb14138004"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cc604dc06027dbeb8281aeac5899c35fcfe7c77b25212833709f0bff4ce74d2a"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:277175a73900ad43a8caeb8b99b9604f21fe8d7c842f2f9061a364a7e220ddb7"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cfbc5b91397c8c2972fdac13fb3e4ed2f7f8ccac85cd2c644887557780a9b6e2"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2762bfff264c4e73c0e507274b40634ff465e025f0eaf050897e88ec8367575d"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2f171a900d59d51511209f7476933c34a0c2c711078d3c80e74e0fe4f38680ec"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:780b90c313348f030b811efc37b0fa1431163cb8db8064cf88a7936b6ce5f222"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b242455eccdfcd1fa4134c431a30737d2b4f045770f8fe84356b3469d4b919"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a75ffc1bd5def584129774c158e108e5d768e10b75813f2b32650bb041066ed6"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1fc1ed882d1e8df932a66e2999429ba6cc4d5172914c904ab193381fba825360"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:44e342e8cc11b4e79dae5c57f2fb6360c3c20cc57d32049af8f567f5b4bcb5f4"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c2f9ccd5c4be370939a2e17602fbc49995299203da72a3429db013d44d590e86"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:02ea4cb627c76f48cd9fb37cf7ab22bd51e57e1b519807234b473faebe526796"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-win32.whl", hash = "sha256:6551880383f0e6971dc23e512c9ccc986147ce7bfa1cd2e4b520b876c53e9f3d"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:7c35c4cdc65f2a29f34425c446f2f5cdcd0e3c34158931e1cc927ece925ab802"},
|
||||
{file = "xxhash-3.6.0-cp39-cp39-win_arm64.whl", hash = "sha256:ffc578717a347baf25be8397cb10d2528802d24f94cfc005c0e44fef44b5cdd6"},
|
||||
{file = "xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0"},
|
||||
{file = "xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296"},
|
||||
{file = "xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13"},
|
||||
{file = "xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c6dc31591899f5e5666f04cc2e529e69b4072827085c1ef15294d91a004bc1bd"},
|
||||
{file = "xxhash-3.6.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:15e0dac10eb9309508bfc41f7f9deaa7755c69e35af835db9cb10751adebc35d"},
|
||||
{file = "xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yarl"
|
||||
version = "1.22.0"
|
||||
description = "Yet another URL library"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "dev"]
|
||||
groups = ["main", "benchmarks", "dev"]
|
||||
files = [
|
||||
{file = "yarl-1.22.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c7bd6683587567e5a49ee6e336e0612bec8329be1b7d4c8af5687dcdeb67ee1e"},
|
||||
{file = "yarl-1.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5cdac20da754f3a723cceea5b3448e1a2074866406adeb4ef35b469d089adb8f"},
|
||||
@@ -8774,4 +9138,4 @@ type = ["pytest-mypy"]
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.12"
|
||||
content-hash = "9d632f4341ef2e49aa81978032caf9605fd258aff8176263ccb1766b0a1ef4b1"
|
||||
content-hash = "f8040ae4c1cc04a87df47a0176c09cc4d9da81e2a51e3428d07d93d5c2e9724f"
|
||||
|
||||
@@ -142,6 +142,19 @@ optional = true
|
||||
cx-freeze = { git = "https://github.com/ntindle/cx_Freeze.git", rev = "main" }
|
||||
|
||||
|
||||
[tool.poetry.group.benchmarks]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.benchmarks.dependencies]
|
||||
# External benchmark adapters
|
||||
datasets = "^2.14"
|
||||
huggingface-hub = "^0.20"
|
||||
# SWE-bench evaluation (optional - requires Docker)
|
||||
# swebench = "^2.0" # Install separately if needed
|
||||
# Modal for cloud evaluation (optional)
|
||||
# modal = "^0.70" # Install separately if needed
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
Reference in New Issue
Block a user