feat(direct_benchmark): enable shell command execution with safety denylist

Enable agents to execute shell commands during benchmarks by setting
execute_local_commands=True and using denylist mode, which blocks the
listed dangerous commands (rm, sudo, chmod, kill, etc.) and allows
every other command.

Also adds ExecutePython challenge to test code execution capability.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Nicholas Tindle
2026-01-20 00:52:06 -06:00
parent 572c3f5e0d
commit b849eafb7f
2 changed files with 53 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
{
"category": [
"coding"
],
"cutoff": 60,
"dependencies": [],
"eval_id": "execute-python-001",
"ground": {
"answer": "Hello, World!",
"eval": {
"type": "file"
},
"files": [
"hello.py"
],
"should_contain": [
"Hello, World!"
],
"should_not_contain": []
},
"info": {
"description": "Tests basic code execution capability",
"difficulty": "trivial",
"side_effects": []
},
"name": "ExecutePython",
"task": "Write a Python script called 'hello.py' that prints 'Hello, World!' to stdout. Then execute it using the shell to verify it works. The script should be in the workspace."
}

View File

@@ -10,6 +10,7 @@ from typing import Callable, Optional
from autogpt.agent_factory.configurators import create_agent
from autogpt.agents.agent import Agent
from autogpt.app.config import AppConfig, ConfigBuilder
from forge.file_storage import FileStorageBackendName, get_storage
from forge.llm.providers import MultiProvider
@@ -182,6 +183,30 @@ class AgentRunner:
llm_provider=llm_provider,
)
# Enable local shell command execution for benchmark runs.
# Use denylist mode: the commands listed below are blocked; anything else is allowed.
if hasattr(agent, "code_executor"):
agent.code_executor.config.execute_local_commands = True
agent.code_executor.config.shell_command_control = "denylist"
agent.code_executor.config.shell_denylist = [
"rm", # Block file removal
"sudo", # Block privilege escalation
"chmod", # Block permission changes
"chown", # Block ownership changes
"mkfs", # Block filesystem creation
"dd", # Block disk operations
"kill", # Block process killing
"pkill", # Block process killing
"killall", # Block process killing
"reboot", # Block system reboot
"shutdown", # Block system shutdown
"poweroff", # Block system poweroff
"halt", # Block system halt
"init", # Block init commands
"systemctl", # Block systemd commands
"service", # Block service commands
]
self._agent = agent
self._llm_provider = llm_provider
return agent