mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-02-14 08:45:12 -05:00
- Remove old benchmark/ folder with agbenchmark framework
- Move challenges to direct_benchmark/challenges/
- Move analysis tools (analyze_reports.py, analyze_failures.py) to direct_benchmark/
- Move challenges_already_beaten.json to direct_benchmark/
- Update CI workflow to use direct_benchmark
- Update CLAUDE.md files with new benchmarking instructions
- Add benchmarking section to original_autogpt/CLAUDE.md

The direct_benchmark harness directly instantiates agents without HTTP server overhead, enabling parallel execution with asyncio semaphore.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
34 lines
765 B
JSON
34 lines
765 B
JSON
{
  "category": [
    "general",
    "coding",
    "scrape_synthesize",
    "data"
  ],
  "cutoff": 60,
  "dependencies": [],
  "eval_id": "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
  "ground": {
    "answer": "The word 'Washington', printed to a .txt file named anything",
    "eval": {
      "type": "file"
    },
    "files": [
      ".txt"
    ],
    "should_contain": [
      "Washington"
    ],
    "should_not_contain": []
  },
  "info": {
    "description": "Tests if the agent can write a file",
    "difficulty": "interface",
    "side_effects": [
      ""
    ]
  },
  "name": "WriteFile",
  "task": "Write the word 'Washington' to a .txt file"
}