mirror of https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-01-09 15:17:59 -05:00

init agbenchmark
37 .gitignore (vendored)
@@ -20,7 +20,6 @@ parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
@@ -50,6 +49,7 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
@@ -72,6 +72,7 @@ instance/
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
@@ -82,7 +83,9 @@ profile_default/
ipython_config.py

# pyenv
.python-version
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -91,7 +94,22 @@ ipython_config.py
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
@@ -128,6 +146,15 @@ dmypy.json
# Pyre type checker
.pyre/

/data
# pytype static type analyzer
.pytype/

/.idea
# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

6 .vscode/settings.json (vendored, Normal file)
@@ -0,0 +1,6 @@
{
    "[python]": {
        "editor.defaultFormatter": "ms-python.black-formatter"
    },
    "python.formatting.provider": "none"
}
2 LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2023 Toran Bruce Richards
Copyright (c) 2023 Silen Naihin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

187 README.md
@@ -1,120 +1,131 @@
# Closing in favor of Challenges!
Please check out challenges run in our CI pipeline: https://github.com/Significant-Gravitas/Auto-GPT/tree/master/tests/integration/challenges
# agbenchmark

# Auto-GPT-Benchmarks
A set of standardised benchmarks to assess the performance of Auto-GPT.
This currently uses the OpenAI Evals framework to run the benchmarks.
A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work.

## Setup
#### MVP: function calls api, api returns presigned url, folder is uploaded, write file challenge is measured, score is given

You must add the auto_gpt_benchmarking dir to the python path.
Do this with a path file in your venv. OpenAI Evals needs to import it.
#### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x

These instructions currently assume Ubuntu 22.04.
They should be fairly adaptable to the Windows/MacOS equivalents. Please submit a PR if you would like to see your OS
documented.
## Contributing

Clone the repo with:
- Make sure you have `poetry` installed: `pip install poetry`.
- Then `poetry install` for dependencies.

git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git
cd Auto-GPT-Benchmarks
- To add requirements: `poetry add requirement`.
- To run in the venv: `poetry run python script.py`.

Create a venv with
Feel free to merge with `main` at will (but also feel free to ask for review) - if you can't, send a msg in the R&D chat for access.

python3.10 -m venv venv
If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to the last working commit.

Let people know what the beautiful code you write does, and document everything well.

On MacOS/Linux, activate it with
Share your progress :)

source venv/bin/activate
## Api

On Windows:
FastAPI with REST, import requests

venv/scripts/activate
```
POST hostname:8080/challenges
{
  "test_name": ""
  "challenge": "memory" - optional
}
```
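For illustration, here is roughly how a client could hit that endpoint with the `requests` library. This is a minimal sketch: the host, port, payload fields and response shape are placeholders taken from the MVP notes above, not a finished contract.

```
import requests

# Hypothetical call to the MVP endpoint sketched above; host, port and payload
# fields are placeholders and may change as the API firms up.
response = requests.post(
    "http://hostname:8080/challenges",
    json={
        "test_name": "write_file",  # placeholder test name
        "challenge": "memory",      # optional challenge category
    },
    timeout=30,
)
response.raise_for_status()
print(response.json())
```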

Install the requirements with:
## Auth:

pip install -r requirements.txt
get preSignedUrl from API

If you haven't already, clone the AutoGPT repo somewhere else on your machine.
DO NOT CLONE IT INTO A SUBDIR OF THIS REPO.
```
POST preSignedUrl
{
  "artifacts": [{}]
}
```
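Under the same placeholder assumptions, the auth flow above might look like this from the client side: request a presigned URL for the workspace artifacts, then upload to it. The `url` response field and the zip artifact name are assumptions for illustration only.

```
import requests

# Hypothetical flow: ask the API for a presigned URL, then upload artifacts to it.
resp = requests.post(
    "http://hostname:8080/preSignedUrl",
    json={"artifacts": [{}]},
    timeout=30,
)
resp.raise_for_status()
upload_url = resp.json()["url"]  # assumed response field

with open("workspace.zip", "rb") as artifact:
    # S3-style presigned URLs generally accept a plain PUT of the object bytes.
    requests.put(upload_url, data=artifact, timeout=60).raise_for_status()
```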

cd somewhere/else
git clone git@github.com:Significant-Gravitas/Auto-GPT.git
cd Auto-GPT
git checkout stable # Or the branch you want to benchmark
## Workspace

You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is shown below and should be copied from the .env.template as described in the Auto-GPT README.md.
Kubernetes with AWS S3 or GCP

Auto-GPT/.env
## Challenges

Finally, we assume you have a docker container built from the Dockerfile in the Auto-GPT repo.
#### Dataset

Build this with:
Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/

cd Auto-GPT
docker build -t autogpt .
#### Simple challenge creation through a DSL (domain specific language)

Run your first eval with:
```
Challenge TicTacToeCoding
  Description "The agent should implement a basic tic-tac-toe game in Python."
  Artifacts {
    Code "tictactoe.py"
  }
  Tasks {
    Code "Write a function to initialize the game board."
    Code "Write a function to handle a player's turn."
    Code "Write a function to check for a winning move."
    Test "Write tests for the blog post model, serializer, and view."
    Command "Run Django's test suite to ensure everything is working as expected."
  }
  SuccessCriteria {
    Correctness "The game should correctly alternate between two players."
    Correctness "The game should correctly identify a winning move."
    Efficiency "The game should not use unnecessary computational resources."
    Design "The solution should follow good practices for Django and Django Rest Framework."
  }
EndChallenge
```
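The DSL above is only a proposal. As a rough sketch, a challenge like this could map onto plain Python inside `benchmark/challenges/` (this commit only adds empty stubs there); every class and field name below is an assumption used for illustration.

```
from dataclasses import dataclass, field


@dataclass
class Task:
    kind: str  # e.g. "Code", "Test", "Command", mirroring the DSL keywords above
    text: str


@dataclass
class Challenge:
    # Hypothetical in-code mirror of the proposed DSL; names are illustrative only.
    name: str
    description: str
    artifacts: list[str] = field(default_factory=list)
    tasks: list[Task] = field(default_factory=list)
    success_criteria: dict[str, list[str]] = field(default_factory=dict)


tictactoe = Challenge(
    name="TicTacToeCoding",
    description="The agent should implement a basic tic-tac-toe game in Python.",
    artifacts=["tictactoe.py"],
    tasks=[Task("Code", "Write a function to initialize the game board.")],
    success_criteria={"Correctness": ["The game should correctly alternate between two players."]},
)
```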

cd Auto-GPT-Benchmarks
python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT
#### Validators

You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in
Designed to handle specific types of output (e.g., text, code, structured data)

auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml.
#### Logging

To see a full list of available flags, run `python3 -m auto_gpt_benchmarking --help`
Some of these are inherited from the OpenAI Evals framework and do not work quite as intended, as they are not applicable
to this use case.
Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc

This saves a file in `Auto-GPT-Benchmarks/data/records.jsonl`
This file is currently a default that is configurable with the --record_path flag. You will have to specify the fully
qualified path.
#### Written Challenges

## Currently Supported Benchmarks:
From OpenAI Evals
- [x] test-match
- [x] test-fuzzy-match
- [ ] Everything else they have...
For code and writing, we can create a reference text and use metrics like METEOR, BERTScore, BARTScore

## Understanding OpenAI Evals
## Repo

The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs
```
|-- agbenchmark/ **main project directory**
| |-- __init__.py
| |-- server/
| | |-- __init__.py
| | |-- api.py **opens server on host and exposes urls**
| | |-- utils.py
| |-- benchmark/
| | |-- __init__.py
| | |-- benchmark.py **combining scores, metrics, final evaluation**
| | |-- run.py **entry point. sets everything up**
| | |-- challenges/ **challenges across different metrics**
| | | |-- __init__.py
| | | |-- Challenge.py **easy challenge creation through Challenge class. potentially how DSL is defined. may need to inherit challenge class like Adaptability(Challenge)**
| | | |-- utils.py
| | | |-- adaptability.py
| | | |-- basic_abilities.py
| | | |-- code.py
| | | |-- memory.py
| | | |-- retrieval.py
| | | |-- web_navigation.py
| | | |-- writing.py
| |-- workspace/ **workspace related func**
| | |-- __init__.py
| | |-- workspace_manager.py **creation, deletion, preSignedUrl generation**
| | |-- cloud_services/
| | | |-- __init__.py
| | | |-- aws.py **not finalized, but write, read, and del files**
|-- tests/ **test func of agbenchmark**
| |-- __init__.py
| |-- test_api.py
| |-- test_benchmark.py
| |-- test_workspace_manager.py
```

The basic idea is this though (see the sketch right after this list for how the pieces fit together in code):
1. Use a completion function to point to the language model, or in our case AutoGPT, the model you want to test.
2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir.
3. Run the evals against the completion function.
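A compressed sketch of those three steps in code, using the `OAIRunArgs` and `Evaluator` classes that appear further down in this diff (in the `auto_gpt_benchmarking` package this commit removes). It assumes that package and the OpenAI Evals dependency are installed, the API keys are set, and the yaml registration already points at your Auto-GPT checkout; the record path is an example value.

```
from pathlib import Path

from auto_gpt_benchmarking.Evaluator import Evaluator, OAIRunArgs

# Wire the registered completion function to an eval and record the results.
run_args = OAIRunArgs(
    completion_fn="auto_gpt_completion_fn",  # name registered via the completion_fns yaml
    eval="test-match",                       # any OpenAI eval or one defined in this repo
    record_path=str(Path("data") / "records.jsonl"),
)
Evaluator(run_args).run()
```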

Then you can also make more yaml-defined evals and run them against the completion function as needed.

### Completions Functions

See our yaml file in the `completion_fns` dir for the registration of the completion function.
See our completion function itself in CompletionFn.py.
That points to the AutoGPT model we want to test, which is spun up dynamically in a docker container in AutoGPTAgent.py.


# Example final output:

/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl
{"spec": {"completion_fns": ["auto_gpt_completion_fn"], "eval_name": "test-match.s1.simple-v0", "base_eval": "test-match", "split": "s1", "run_config": {"completion_fns": ["auto_gpt_completion_fn"], "eval_spec": {"cls": "evals.elsuite.basic.match:Match", "args": {"samples_jsonl": "test_match/samples.jsonl"}, "key": "test-match.s1.simple-v0", "group": "test-basic"}, "seed": 20220722, "max_samples": null, "command": "/home/douglas/AGI/Auto-GPT-Benchmarks-fork/venv/bin/oaieval auto_gpt_completion_fn test-match --registry_path /home/douglas/AGI/Auto-GPT-Benchmarks-fork/auto_gpt_benchmarking", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230417220821DPM75QNS", "created_at": "2023-04-17 22:08:21.904498"}}
{"final_report": {"accuracy": 0.3333333333333333}}
{"run_id": "230417220821DPM75QNS", "event_id": 0, "sample_id": "test-match.s1.2", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: OpenAI was founded in 20\nAssistant: ", "sampled": "OpenAI was founded in 2015.2015"}, "created_by": "", "created_at": "2023-04-17 22:10:13.127375+00:00"}
{"run_id": "230417220821DPM75QNS", "event_id": 1, "sample_id": "test-match.s1.2", "type": "match", "data": {"correct": false, "expected": "15", "picked": null, "sampled": "OpenAI was founded in 2015.2015", "options": ["15"]}, "created_by": "", "created_at": "2023-04-17 22:10:13.127550+00:00"}
{"run_id": "230417220821DPM75QNS", "event_id": 2, "sample_id": "test-match.s1.1", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: The first US president was \nAssistant: ", "sampled": "George Washington"}, "created_by": "", "created_at": "2023-04-17 22:11:17.761693+00:00"}
{"run_id": "230417220821DPM75QNS", "event_id": 3, "sample_id": "test-match.s1.1", "type": "match", "data": {"correct": true, "expected": "George Washington", "picked": "George Washington", "sampled": "George Washington", "options": ["George Washington"]}, "created_by": "", "created_at": "2023-04-17 22:11:17.761739+00:00"}
{"run_id": "230417220821DPM75QNS", "event_id": 4, "sample_id": "test-match.s1.0", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: Once upon a \nAssistant: ", "sampled": "Once upon a time"}, "created_by": "", "created_at": "2023-04-17 22:12:04.691026+00:00"}
{"run_id": "230417220821DPM75QNS", "event_id": 5, "sample_id": "test-match.s1.0", "type": "match", "data": {"correct": false, "expected": "time", "picked": null, "sampled": "Once upon a time", "options": ["time"]}, "created_by": "", "created_at": "2023-04-17 22:12:04.691064+00:00"}
(venv) douglas@douglas-XPS-15-9500:~/AGI/Auto-GPT-Benchmarks-fork$
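
The log above is plain JSONL, so pulling out the final accuracy or the individual match events takes only a few lines of Python. A small sketch; the path is the example record file from the session shown above.

```
import json
from pathlib import Path

# Example path from the session above; substitute your own record file.
records = Path("/tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl")

final_report = None
matches = []
with records.open() as f:
    for line in f:
        event = json.loads(line)
        if "final_report" in event:
            final_report = event["final_report"]
        elif event.get("type") == "match":
            matches.append(event["data"]["correct"])

print(f"accuracy: {final_report['accuracy'] if final_report else 'n/a'}")
print(f"correct on {sum(matches)}/{len(matches)} samples")
```
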
# What is next?

- [ ] Run the rest of the OpenAI Evals, especially the model-graded ones
- [ ] Build longer form tasks (code fix backed by testing)
- [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes with the core AutoGPT project
- [ ] Get token counting data from the model. Add scores to result files based on pricing associated with tokens and models used
- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework.
- [ ] Figure out how the OpenAI Evals results are saved...
- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't.
Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility

1 agbenchmark/benchmark/benchmark.py (Normal file)
@@ -0,0 +1 @@
# how well the agent did on the challenges, the metrics calculation
0 agbenchmark/benchmark/challenges/Challenge.py (Normal file)
0 agbenchmark/benchmark/challenges/__init__.py (Normal file)
0 agbenchmark/benchmark/challenges/code/c1_test.py (Normal file)
0 agbenchmark/benchmark/challenges/memory/m1_test.py (Normal file)
0 agbenchmark/benchmark/challenges/utils.py (Normal file)
0 agbenchmark/benchmark/challenges/writing/w1_test.py (Normal file)
1 agbenchmark/benchmark/run.py (Normal file)
@@ -0,0 +1 @@
# running all of the different challenges
0 agbenchmark/server/__init__.py (Normal file)
0 agbenchmark/server/api.py (Normal file)
0 agbenchmark/server/utils.py (Normal file)
0 agbenchmark/workspace/__init__.py (Normal file)
0 agbenchmark/workspace/cloud_services/aws.py (Normal file)
1 agbenchmark/workspace/workspace_manager.py (Normal file)
@@ -0,0 +1 @@
# Manages the workspaces including creation, deletion, etc
auto_gpt_benchmarking/AutoGPTAgent.py (deleted)
@@ -1,155 +0,0 @@
"""
This instantiates an AutoGPT agent who is capable of handling any task.
It is designed to pass benchmarks as effectively as possible.

Loads in the ai_settings.yaml file to get the AI's name, role, and goals.
Sets the ai to continuous mode, but kills it if it takes more than 50,000 tokens on any particular evaluation.

The model is instantiated with a prompt from the AutoGPT completion function.

Eventually we will also save and log all of the associated output and thinking for the model as well
"""
from pathlib import Path
import docker
import asyncio
import aiodocker


class AutoGPTAgent:
    """
    A class object that contains the configuration information for the AI
    The init function takes an evaluation prompt.
    It copies the ai_settings.yaml file in AutoGPTData to the Auto-GPT repo.
    It then copies the given prompt to a text file to Auto-GPT/auto_gpt_workspace called prompt.txt
    It then polls the token usage of the model and for a file called output.txt in the Auto-GPT/auto_gpt_workspace folder.
    If the model has used more than 50,000 tokens, it kills the model.
    If the model has used less than 50,000 tokens, it returns the output.txt file.
    """
    def _clean_up_workspace(self):
        """
        Cleans up the workspace by deleting the prompt.txt and output.txt files.
        :return:
        """
        # check if the files are there and delete them if they are
        if self.prompt_file.exists():
            self.prompt_file.unlink()
        if self.output_file.exists():
            self.output_file.unlink()
        if self.file_logger.exists():
            self.file_logger.unlink()

    def _copy_ai_settings(self) -> None:
        self.ai_settings_dest.write_text(self.ai_settings_file.read_text())

    def _copy_prompt(self) -> None:
        self.prompt_file.write_text(self.prompt)

    async def _stream_logs(self, container: aiodocker.containers.DockerContainer) -> None:
        try:
            async for line in container.log(stdout=True, stderr=True, follow=True, tail="all"):
                print(line.strip())
            await asyncio.sleep(1)
        except aiodocker.exceptions.DockerError as e:
            # Handle Docker errors (e.g., container is killed or removed)
            print('Docker error: {}'.format(e))

    async def _run_stream_logs(self) -> None:
        """
        This grabs the docker containers id and streams the logs to the console with aiodocker.
        :return: None
        """
        async with aiodocker.Docker() as docker_client:
            try:
                container = docker_client.containers.container(self.container.id)
                await self._stream_logs(container)
            except aiodocker.exceptions.DockerError as e:
                # Handle cases when the container is not found
                print('Container not found: {}'.format(e))

    def _start_agent(self):
        """
        This starts the agent in the docker container.
        This assumes you have the docker image built with:
        docker build -t autogpt .
        In the dockerfile in the Auto-GPT repo.
        You also must set up the .env file in the Auto-GPT repo.
        :return:
        """
        client = docker.from_env()
        env_file = self.auto_gpt_path / ".env"
        envs = [
            f"{line.strip()}" for line in open(
                env_file
            ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n" and "=" in line and not line.startswith('SMART_LLM_MODEL')]

        envs.append("SMART_LLM_MODEL=gpt-3.5-turbo")

        self.container = client.containers.run(
            image="autogpt",
            command="--continuous -C '/app/auto_gpt_workspace/ai_settings.yaml' --skip-news",
            environment=envs,
            volumes={
                self.auto_workspace: {"bind": "/app/auto_gpt_workspace", "mode": "rw"},
                f"{self.auto_gpt_path}/autogpt": {"bind": "/app/autogpt", "mode": "rw"},
            },
            stdin_open=True,
            tty=True,
            detach=True
        )
        asyncio.run(self._run_stream_logs())

    def _poll_for_output(self):
        """
        This polls the output file to see if the model has finished.
        :return:
        """
        while True:
            if self.output_file.exists():
                print("Output file exists")
                return self.output_file.read_text()

    def __init__(self, prompt, auto_gpt_path: str):
        self.auto_gpt_path = Path(auto_gpt_path)
        self.auto_workspace = self.auto_gpt_path / "autogpt" / "auto_gpt_workspace"
        # if the workspace doesn't exist, create it
        if not self.auto_workspace.exists():
            self.auto_workspace.mkdir()
        self.prompt_file = self.auto_workspace / "prompt.txt"
        self.output_file = self.auto_workspace / "output.txt"
        self.file_logger = self.auto_workspace / "file_logger.txt"
        self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml"
        self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml"
        self.prompt = prompt
        self._clean_up_workspace()
        self._copy_ai_settings()
        self._copy_prompt()
        self.container = None
        self.killing = False
        self.logging_task = None

    def start(self):
        self._start_agent()
        answer = self._poll_for_output()
        print(f"Prompt was: {self.prompt}, Answer was: {answer}")
        self.kill()
        return answer

    def kill(self):
        if self.killing:
            return
        self.killing = True
        self._clean_up_workspace()
        if self.container:
            # kill the container
            try:
                self.container.kill()
                self.container.remove()
            except docker.errors.APIError:
                print('Couldn\'t find container to kill. Assuming container successfully killed itself.')
        if self.logging_task:
            self.logging_task.cancel()
        self.killing = False
auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml (deleted)
@@ -1,6 +0,0 @@
ai_goals:
- Evaluate the prompt in `prompt.txt` and find the best answer in the format provided.
- Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer, so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer.
- Save the final answer and output to the `output.txt` file, the only file you should write to, then immediately exit the program because you are done.
ai_name: EvaluationAgent
ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible
auto_gpt_benchmarking/CompletionFn.py (deleted)
@@ -1,34 +0,0 @@
from evals.api import CompletionFn, CompletionResult

from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling
from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent


class AutoGPTCompletionResult(CompletionResult):
    def __init__(self, response) -> None:
        self.response = response

    def get_completions(self) -> list[str]:
        return [self.response.strip()]


class AutoGPTCompletionFn(CompletionFn):

    def __init__(self, auto_gpt_path, **kwargs) -> None:
        self.auto_gpt_path = auto_gpt_path
        self.agent = None

    def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult:
        prompt = CompletionPrompt(prompt).to_formatted_prompt()
        self.kill_agent()
        self.agent = AutoGPTAgent(prompt, self.auto_gpt_path)
        response = self.agent.start()
        record_sampling(prompt=prompt, sampled=response)
        return AutoGPTCompletionResult(response)

    def kill_agent(self):
        if self.agent:
            self.agent.kill()
auto_gpt_benchmarking/Evaluator.py (deleted)
@@ -1,61 +0,0 @@
"""
The evaluator class actually executes the evals.
"""
from evals.cli import oaieval
from evals.registry import Registry
from pathlib import Path
from typing import List, Optional, Tuple
import sys


class OAIRunArgs:
    def __init__(
        self,
        completion_fn: str,
        eval: str,
        extra_eval_params: str = "",
        max_samples: int = None,
        cache: bool = True,
        visible: bool = None,
        seed: int = 20220722,
        user: str = "",
        record_path: str = None,
        log_to_file: str = None,
        debug: bool = False,
        local_run: bool = True,
        dry_run: bool = False,
        dry_run_logging: bool = True,
    ):
        self.completion_fn = completion_fn
        self.eval = eval
        self.extra_eval_params = extra_eval_params
        self.max_samples = max_samples
        self.cache = cache
        self.visible = visible
        self.seed = seed
        self.user = user
        self.record_path = record_path
        self.log_to_file = log_to_file
        self.debug = debug
        self.local_run = local_run
        self.dry_run = dry_run
        self.dry_run_logging = dry_run_logging
        # create the record and logging paths if they don't exist
        Path(self.record_path).parent.mkdir(parents=True, exist_ok=True)
        # Path(self.log_to_file).parent.mkdir(parents=True, exist_ok=True)
        # Registry path should be the auto_gpt_benchmarking folder
        self.registry_path = None


class Evaluator:
    def __init__(self, oai_run_args: OAIRunArgs):
        self.oai_run_args = oai_run_args
        registry_path = Path(__file__).parent

        # add registry path to the python system path
        sys.path.append(str(registry_path))
        self.oai_run_args.registry_path = [registry_path]
        # self.registry = Registry([registry_path])

    def run(self):
        oaieval.run(self.oai_run_args)
auto_gpt_benchmarking/__main__.py (deleted)
@@ -1,158 +0,0 @@
"""
This is the main evaluation file. In it you can specify the following:

1. The number of threads to use for evaluation. This is set to 1 by default. And will remain that way until we can spin
up containers on command
2. The timeout for each thread. This is set to 60 seconds by default. This is the amount of time each thread will run
for before it is killed when evaluating an agent
3. The path to the AutoGPT code. This is a required parameter as we do not know where your code lives.
4. The evals you would like to run. The options here are any OpenAI eval, or any of the evals defined in this repository


What this file does is it parses the params given and then runs the evals with OpenAI's evals framework.
"""

import argparse
import os
import sys
from pathlib import Path
from datetime import datetime
import yaml
from datetime import datetime


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "eval", type=str, help="Name of an eval. See registry.")
    parser.add_argument(
        "--completion-fn",
        type=str,
        dest="completion_fn",
        default="auto_gpt_completion_fn",
        help="One or more CompletionFn URLs, separated by commas (,). "
        "A CompletionFn can either be the name of a model available in the OpenAI API or a key in the registry "
        "(see evals/registry/completion_fns).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=300,
        help="The timeout for each thread",
    )
    parser.add_argument(
        "--auto-gpt-path",
        type=str,
        default=None,
        help="The path to the AutoGPT code. This updates auto_gpt_completion_fn.yaml in completion fns. "
        "So you only need to set this once.",
    )
    parser.add_argument("--extra_eval_params", type=str, default="")
    parser.add_argument("--max_samples", type=int, default=None)
    parser.add_argument(
        "--cache", action=argparse.BooleanOptionalAction, default=True)
    parser.add_argument(
        "--visible", action=argparse.BooleanOptionalAction, default=None)
    parser.add_argument("--seed", type=int, default=20220722)
    parser.add_argument("--user", type=str, default="")
    parser.add_argument("--record_path", type=str, default=None)
    parser.add_argument(
        "--log_to_file", type=str, default=None,  # default=str(
        # Path(__file__).parent.parent / "data" / "log" / "log.txt"
        # ), help="Log to a file instead of stdout"
    )
    parser.add_argument(
        "--debug", action=argparse.BooleanOptionalAction, default=False)
    parser.add_argument(
        "--local-run", action=argparse.BooleanOptionalAction, default=True)
    parser.add_argument(
        "--dry-run", action=argparse.BooleanOptionalAction, default=False)
    parser.add_argument("--dry-run-logging",
                        action=argparse.BooleanOptionalAction, default=True)
    return parser.parse_args()


def update_yaml_with_auto_gpt_path(yaml_path: str, auto_gpt_path: str or None) -> Path:
    """
    If there is a given auto_gpt_path, then we need to update the yaml file to include it in the system path
    If we don't have one, then we get the path from the yaml.
    If none exists in the yaml and we don't have a path, then we raise an exception.
    :param yaml_path: The path to the yaml file
    :param auto_gpt_path: The path to the AutoGPT code
    :return: The path to the AutoGPT code
    """
    with open(yaml_path, "r") as f:
        yaml_data = yaml.safe_load(f)
    if yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] is None and auto_gpt_path is None:
        raise Exception(
            "You must specify a auto_gpt_path in the yaml file or pass it in as a parameter")
    if auto_gpt_path is None:
        auto_gpt_path = yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"]
    if auto_gpt_path is not None:
        yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] = auto_gpt_path
        with open(yaml_path, "w") as f:
            yaml.safe_dump(yaml_data, f)

    return Path(auto_gpt_path).absolute()


def load_env_file(env_path: Path):
    if not env_path.exists():
        raise FileNotFoundError('You must set the OpenAI key in the AutoGPT env file. '
                                'We need your api keys to start the AutoGPT agent and use OpenAI evals')
    with open(env_path, "r") as f:
        # find the OPENAI_API_KEY key, split it from the equals sign, and assign it so OpenAI evals can use it.
        for line in f.readlines():
            if line.startswith("OPENAI_API_KEY"):
                os.environ["OPENAI_API_KEY"] = line.split("=")[1].strip()
                break


if __name__ == "__main__":
    args = parse_args()
    # do not run in multiprocessing mode. We do not use this right now, as it disables OpenAI's timeouts :(
    # os.environ["EVALS_SEQUENTIAL"] = "1"
    os.environ["EVALS_THREAD_TIMEOUT"] = str(args.timeout)
    os.environ["EVALS_THREADS"] = str(1)

    # Update the yaml file with the auto_gpt_path
    autogpt_path = update_yaml_with_auto_gpt_path(
        str(Path(__file__).parent / "completion_fns" /
            "auto_gpt_completion_fn.yaml"),
        args.auto_gpt_path
    )

    # Add the benchmarks path to the system path so we can import auto_gpt_benchmarking
    sys.path.append(str(Path(__file__).parent.parent.absolute()))

    # load all of the environment variables in the auto-gpt path/.env file
    load_env_file(Path(autogpt_path) / ".env")

    # Obviously, a top level import would be better. This allows us to set the API key with the env file, as it gets
    # set in the evaluator. We can't set it before the import because the import will fail without an API key.
    from auto_gpt_benchmarking.Evaluator import Evaluator, OAIRunArgs
    if args.record_path is None:
        args.record_path = str(Path(
            __file__).parent.parent / "data" / f"eval-{args.eval}-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl")
    run_args = OAIRunArgs(
        completion_fn=args.completion_fn,
        eval=args.eval,
        extra_eval_params=args.extra_eval_params,
        max_samples=args.max_samples,
        cache=args.cache,
        visible=args.visible,
        seed=args.seed,
        user=args.user,
        record_path=args.record_path,
        log_to_file=args.log_to_file,
        debug=args.debug,
        local_run=args.local_run,
        dry_run=args.dry_run,
        dry_run_logging=args.dry_run_logging)

    # Run the evals
    evaluator = Evaluator(
        run_args
    )
    evaluator.run()
auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml (deleted)
@@ -1,4 +0,0 @@
auto_gpt_completion_fn:
  args:
    auto_gpt_path:
  class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn
File diff suppressed because one or more lines are too long
101 poetry.lock (generated, Normal file)
@@ -0,0 +1,101 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.

[[package]]
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]

[[package]]
name = "exceptiongroup"
version = "1.1.1"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
files = [
    {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"},
    {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"},
]

[package.extras]
test = ["pytest (>=6)"]

[[package]]
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
optional = false
python-versions = ">=3.7"
files = [
    {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
]

[[package]]
name = "packaging"
version = "23.1"
description = "Core utilities for Python packages"
optional = false
python-versions = ">=3.7"
files = [
    {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"},
    {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
]

[[package]]
name = "pluggy"
version = "1.0.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.6"
files = [
    {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
    {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
]

[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]

[[package]]
name = "pytest"
version = "7.3.2"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.7"
files = [
    {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"},
    {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"},
]

[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}

[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]

[[package]]
name = "tomli"
version = "2.0.1"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.7"
files = [
    {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
    {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]

[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "c5b989915c413ab901c39dd0c4f3b0fe203558c2879952a2460a52bda4f3e857"
23 pyproject.toml (Normal file)
@@ -0,0 +1,23 @@
[tool.poetry]
name = "agbenchmark"
version = "0.1.0"
description = "Benchmarking the performance of agents far and wide, regardless of how they are set up and how they work"
authors = ["Silen Naihin <silen.naihin@gmail.com>"]
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.9"
pytest = "^7.3.2"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
minversion = "6.0"
addopts = "-ra -q"
testpaths = [
    "tests", "benchmark/challenges",
]
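Because `testpaths` includes `benchmark/challenges`, pytest will also discover test modules like the empty `c1_test.py`, `m1_test.py` and `w1_test.py` stubs added in this commit. The snippet below is a rough sketch of what one of those challenge tests might eventually look like; the fixture, workspace layout and assertion are assumptions, not part of this commit.

```
# Hypothetical contents for a file such as agbenchmark/benchmark/challenges/code/c1_test.py.
from pathlib import Path

import pytest


@pytest.fixture
def workspace(tmp_path: Path) -> Path:
    # Stand-in for the workspace directory an agent would write into.
    return tmp_path


def test_write_file_challenge(workspace: Path) -> None:
    # In a real challenge the agent would produce this file; here we fake it
    # so the example is self-contained and runnable.
    (workspace / "output.txt").write_text("hello world")
    assert (workspace / "output.txt").read_text().strip() == "hello world"
```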
requirements.txt (deleted)
@@ -1,81 +0,0 @@
aiodocker==0.21.0
aiohttp==3.8.4
aiosignal==1.3.1
asn1crypto==1.5.1
async-timeout==4.0.2
attrs==23.1.0
backoff==2.2.1
blobfile==2.0.1
cachetools==5.3.0
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==2.1.1
click==8.1.3
colorama==0.4.6
contourpy==1.0.7
cryptography==40.0.2
cycler==0.11.0
dataclasses-json==0.5.7
docker==6.0.1
evals==1.0.2.post1
filelock==3.11.0
fire==0.5.0
fonttools==4.39.3
frozenlist==1.3.3
gptcache==0.1.13
greenlet==2.0.2
idna==3.4
importlib-resources==5.12.0
joblib==1.2.0
kiwisolver==1.4.4
langchain==0.0.142
langdetect==1.0.9
lxml==4.9.2
lz4==4.3.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
matplotlib==3.7.1
mock==5.0.2
multidict==6.0.4
mypy==1.2.0
mypy-extensions==1.0.0
nltk==3.8.1
numexpr==2.8.4
numpy==1.24.2
openai==0.27.4
openapi-schema-pydantic==1.2.4
oscrypto==1.3.0
packaging==23.1
pandas==1.5.3
Pillow==9.5.0
portalocker==2.7.0
pyarrow==10.0.1
pycparser==2.21
pycryptodomex==3.17
pydantic==1.10.7
PyJWT==2.6.0
pyOpenSSL==23.1.1
pyparsing==3.0.9
python-dateutil==2.8.2
pytz==2023.3
PyYAML==6.0
pyzstd==0.15.6
regex==2023.3.23
requests==2.28.2
sacrebleu==2.3.1
setuptools-scm==7.1.0
six==1.16.0
snowflake-connector-python==3.0.2
SQLAlchemy==1.4.47
tabulate==0.9.0
tenacity==8.2.2
termcolor==2.2.0
tiktoken==0.3.3
tomli==2.0.1
tqdm==4.65.0
typing-inspect==0.8.0
typing_extensions==4.5.0
urllib3==1.26.15
websocket-client==1.5.1
yarl==1.8.2
zipp==3.15.0
0 tests/__init__.py (Normal file)
0 tests/test_api.py (Normal file)
0 tests/test_benchmark.py (Normal file)
0 tests/test_workspace_manager.py (Normal file)