feat(direct_benchmark): enable shell command execution with safety denylist

Enable agents to execute shell commands during benchmarks by setting execute_local_commands=True and setting shell_command_control to denylist mode, which blocks dangerous commands (rm, sudo, chmod, kill, etc.) while allowing safe operations. Also adds the ExecutePython challenge to test code execution capability.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-04-08 03:00:28 -04:00 · 2026-01-20 00:52:06 -06:00
parent 572c3f5e0d
commit b849eafb7f
2 changed files with 53 additions and 0 deletions
--- a/classic/direct_benchmark/challenges/verticals/code/0_execute_python/data.json
+++ b/classic/direct_benchmark/challenges/verticals/code/0_execute_python/data.json
@@ -0,0 +1,28 @@
+{
+    "category": [
+        "coding"
+    ],
+    "cutoff": 60,
+    "dependencies": [],
+    "eval_id": "execute-python-001",
+    "ground": {
+        "answer": "Hello, World!",
+        "eval": {
+            "type": "file"
+        },
+        "files": [
+            "hello.py"
+        ],
+        "should_contain": [
+            "Hello, World!"
+        ],
+        "should_not_contain": []
+    },
+    "info": {
+        "description": "Tests basic code execution capability",
+        "difficulty": "trivial",
+        "side_effects": []
+    },
+    "name": "ExecutePython",
+    "task": "Write a Python script called 'hello.py' that prints 'Hello, World!' to stdout. Then execute it using the shell to verify it works. The script should be in the workspace."
+}
--- a/classic/direct_benchmark/direct_benchmark/runner.py
+++ b/classic/direct_benchmark/direct_benchmark/runner.py
@@ -10,6 +10,7 @@ from typing import Callable, Optional
 from autogpt.agent_factory.configurators import create_agent
 from autogpt.agents.agent import Agent
 from autogpt.app.config import AppConfig, ConfigBuilder
+
 from forge.file_storage import FileStorageBackendName, get_storage
 from forge.llm.providers import MultiProvider

@@ -182,6 +183,30 @@ class AgentRunner:
            llm_provider=llm_provider,
        )

+        # Enable local command execution for benchmarks
+        # Use denylist mode to block dangerous commands while allowing flexibility
+        if hasattr(agent, "code_executor"):
+            agent.code_executor.config.execute_local_commands = True
+            agent.code_executor.config.shell_command_control = "denylist"
+            agent.code_executor.config.shell_denylist = [
+                "rm",  # Block file removal
+                "sudo",  # Block privilege escalation
+                "chmod",  # Block permission changes
+                "chown",  # Block ownership changes
+                "mkfs",  # Block filesystem creation
+                "dd",  # Block disk operations
+                "kill",  # Block process killing
+                "pkill",  # Block process killing
+                "killall",  # Block process killing
+                "reboot",  # Block system reboot
+                "shutdown",  # Block system shutdown
+                "poweroff",  # Block system poweroff
+                "halt",  # Block system halt
+                "init",  # Block init commands
+                "systemctl",  # Block systemd commands
+                "service",  # Block service commands
+            ]
+
        self._agent = agent
        self._llm_provider = llm_provider
        return agent