mini-agi, simple challenge creation, --mock flag

2026-04-08 03:00:28 -04:00 · 2023-06-27 18:17:54 -04:00
parent 36ef54340f
commit f933717d8b
19 changed files with 235 additions and 158 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,4 @@
+OPENAI_API_KEY=
+AGENT_NAME=mini-agi
+AGENT_TIMEOUT=60
+MOCK_TEST=False
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ class TestSomething(CategoryChallenge):
    """Testing if LLM can read a file"""

    @pytest.mark.parametrize(
-        "server_response",
+        "run_agent",
        [(data.task, data.mock_func)],
        indirect=True,
    )
--- a/agbenchmark/Challenge.py
+++ b/agbenchmark/Challenge.py
@@ -1,12 +1,63 @@
 import os
 import glob
+import pytest
+from abc import ABC, abstractmethod
 from agbenchmark.challenges.define_task_types import Ground
+from agbenchmark.challenges.define_task_types import ChallengeData
+from dotenv import load_dotenv, set_key
+
+load_dotenv()
+
+mock_test_str = os.getenv("MOCK_TEST")
+MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False


-class Challenge:
+class Challenge(ABC):
    """The parent class to all specific challenges classes.
    Defines helper methods for running a challenge"""

+    @abstractmethod
+    def get_file_path(self) -> str:
+        """This should be implemented by any class which inherits from Challenge"""
+        pass
+
+    @property
+    def data(self) -> ChallengeData:
+        return ChallengeData.deserialize(self.get_file_path())
+
+    @property
+    def mock(self):
+        return self.data.mock.mock_func if self.data.mock else None
+
+    @property
+    def task(self):
+        return (
+            self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task
+        )
+
+    @property
+    def dependencies(self) -> list:
+        print("self.data.dependencies", self.data.dependencies)
+        return self.data.dependencies
+
+    @property
+    def name(self) -> str:
+        print("self.data.name", self.data.name)
+        return self.data.name
+
+    @pytest.mark.parametrize(
+        "run_agent",
+        [(task, mock)],
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "challenge_data",
+        [data],
+        indirect=True,
+    )
+    def test_method(self, workspace):
+        raise NotImplementedError
+
    @staticmethod
    def open_file(workspace: str, filename: str):
        script_dir = os.path.abspath(workspace)
--- a/agbenchmark/challenges/define_task_types.py
+++ b/agbenchmark/challenges/define_task_types.py
@@ -4,6 +4,11 @@ import json
 import os


+class Mock(BaseModel):
+    mock_func: str
+    mock_task: Optional[str] = None
+
+
 class Info(BaseModel):
    difficulty: str
    description: str
@@ -12,17 +17,18 @@ class Info(BaseModel):

 class Ground(BaseModel):
    answer: str
-    should_contain: Optional[List[str]]
-    should_not_contain: Optional[List[str]]
+    should_contain: Optional[List[str]] = None
+    should_not_contain: Optional[List[str]] = None
    files: List[str]


 class ChallengeData(BaseModel):
+    name: str
    category: List[str]
    task: str
    dependencies: List[str]
    ground: Ground
-    mock_func: Optional[str] = None
+    mock: Optional[Mock] = None
    info: Info

    def serialize(self, path: str) -> None:
--- a/agbenchmark/challenges/retrieval/r1/r1_data.json
+++ b/agbenchmark/challenges/retrieval/r1/r1_data.json
@@ -1,16 +1,20 @@
 {
+  "name": "retrieval1",
  "category": ["basic"],
-  "task": "What is the capital of America?",
+  "task": "Print the capital of America to a .txt file",
  "dependencies": [],
  "ground": {
    "answer": "Washington",
    "should_contain": ["Washington"],
    "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
-    "files": ["file_to_check.txt"]
+    "files": [".txt"]
+  },
+  "mock": {
+    "mock_func": "basic_write_file_mock",
+    "mock_task": "What is the capital of America?"
  },
-  "mock_func": "basic_write_file_mock",
  "info": {
-    "difficulty": "easy",
+    "difficulty": "basic",
    "description": "Tests the writing to file",
    "side_effects": ["tests if there is in fact an LLM attached"]
  }
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -4,30 +4,18 @@ from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os


-data = ChallengeData.deserialize(
-    os.path.join(os.path.dirname(__file__), "r1_data.json")
-)
-
-
 class TestRetrieval1(RetrievalChallenge):
    """The first information-retrieval challenge"""

-    @pytest.mark.parametrize(
-        "server_response",
-        [(data.task, data.mock_func)],
-        indirect=True,
-    )
-    @pytest.mark.parametrize(
-        "regression_data",
-        [data],
-        indirect=True,
-    )
-    def test_retrieval(self, workspace, current_challenge_data):
-        files_contents = self.open_files(workspace, data.ground.files)
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "r1_data.json")
+
+    def test_method(self, workspace):
+        files_contents = self.open_files(workspace, self.data.ground.files)

        scores = []
        for file_content in files_contents:
-            score = self.scoring(file_content, data.ground)
+            score = self.scoring(file_content, self.data.ground)
            print("Your score is:", score)
            scores.append(score)

--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,5 +1,5 @@
 {
  "hostname": "localhost",
  "port": 8080,
-  "workspace": "agbenchmark/mocks/workspace"
+  "workspace": "C:/Users/silen/miniagi"
 }
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -4,18 +4,24 @@ import pytest
 import shutil
 from agbenchmark.tests.regression.RegressionManager import RegressionManager
 import requests
-from requests.exceptions import RequestException
 from agbenchmark.mocks.MockManager import MockManager
-from agbenchmark.challenges.define_task_types import ChallengeData
 import subprocess
+from agbenchmark.Challenge import Challenge
+from dotenv import load_dotenv
+
+load_dotenv()


@pytest.fixture(scope="module")
-def config():
+def config(request):
    config_file = os.path.abspath("agbenchmark/config.json")
    print(f"Config file: {config_file}")
    with open(config_file, "r") as f:
        config = json.load(f)
+
+    if request.config.getoption("--mock"):
+        config["workspace"] = "agbenchmark/mocks/workspace"
+
    return config


@@ -34,43 +40,49 @@ def workspace(config):
            print(f"Failed to delete {file_path}. Reason: {e}")


+def pytest_addoption(parser):
+    parser.addoption("--mock", action="store_true", default=False)
+
+
+AGENT_NAME = os.getenv("AGENT_NAME")
+AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT")
+
+
@pytest.fixture(autouse=True)
-def server_response(request, config):
+def run_agent(request, config):
    """Calling to get a response"""
    if isinstance(request.param, tuple):
        task = request.param[0]  # The task is passed in indirectly
-        mock_function_name = request.param[1]
+        mock_function_name = request.param[1] or None
    else:
        task = request.param
        mock_function_name = None

-    # get the current file's directory
-    current_dir = os.path.dirname(os.path.abspath(__file__))
+    if mock_function_name != None and (request.config.getoption("--mock")):
+        if mock_function_name:
+            mock_manager = MockManager(
+                task
+            )  # workspace doesn't need to be passed in, stays the same
+            print("Server unavailable, using mock", mock_function_name)
+            mock_manager.delegate(mock_function_name)
+        else:
+            print("No mock provided")
+    else:
+        path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")

-    # construct the script's path
-    script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py")
+        try:
+            timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60

-    # form the command
-    command = ["python", script_path, task]
-
-    # if mock_function_name:
-    #     mock_manager = MockManager(
-    #         task
-    #     )  # workspace doesn't need to be passed in, stays the same
-    #     print("Server unavailable, using mock", mock_function_name)
-    #     mock_manager.delegate(mock_function_name)
-    # else:
-    #     print("No mock provided")
-
-    try:
-        # run the command and wait for it to complete
-        result = subprocess.run(
-            command, shell=True, check=True, text=True, capture_output=True
-        )
-        return result
-    except subprocess.CalledProcessError as e:
-        print(f"Subprocess failed with the following error:\n{e}")
-        # If the subprocess returns a non-zero exit status
+            subprocess.run(
+                ["python", "miniagi.py", task],
+                check=True,
+                cwd=path,
+                timeout=timeout
+                # text=True,
+                # capture_output=True
+            )
+        except subprocess.TimeoutExpired:
+            print("The subprocess has exceeded the time limit and was terminated.")


 regression_json = "agbenchmark/tests/regression/regression_tests.json"
@@ -80,13 +92,13 @@ regression_manager = RegressionManager(regression_json)

 # this is to get the challenge_data from every test
@pytest.fixture(autouse=True)
-def regression_data(request):
+def challenge_data(request):
    return request.param


 def pytest_runtest_makereport(item, call):
    if call.when == "call":
-        challenge_data = item.funcargs.get("regression_data", None)
+        challenge_data = item.funcargs.get("challenge_data", None)
        difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
        dependencies = challenge_data.dependencies if challenge_data else []

@@ -105,9 +117,9 @@ def pytest_runtest_makereport(item, call):

 def pytest_collection_modifyitems(items):
    """Called once all test items are collected. Used
-    to add regression marker to collected test items."""
+    to add regression and depends markers to collected test items."""
    for item in items:
-        print("pytest_collection_modifyitems", item.nodeid)
+        # regression add
        if item.nodeid.split("::")[1] in regression_manager.tests:
            print(regression_manager.tests)
            item.add_marker(pytest.mark.regression)
@@ -116,3 +128,26 @@ def pytest_collection_modifyitems(items):
 def pytest_sessionfinish():
    """Called at the end of the session to save regression tests"""
    regression_manager.save()
+
+
+# this is so that all tests can inherit from the Challenge class
+def pytest_generate_tests(metafunc):
+    if "challenge_data" in metafunc.fixturenames:
+        # Get the instance of the test class
+        test_class = metafunc.cls()
+
+        # Generate the parameters
+        params = test_class.data
+
+        # Add the parameters to the test function
+        metafunc.parametrize("challenge_data", [params], indirect=True)
+
+    if "run_agent" in metafunc.fixturenames:
+        # Get the instance of the test class
+        test_class = metafunc.cls()
+
+        # Generate the parameters
+        params = [(test_class.task, test_class.mock)]
+
+        # Add the parameters to the test function
+        metafunc.parametrize("run_agent", params, indirect=True)
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -2,6 +2,10 @@ import click
 import pytest
 import json
 import os
+from pathlib import Path
+from dotenv import load_dotenv, set_key
+
+load_dotenv()


@click.group()
@@ -12,8 +16,8 @@ def cli():
@cli.command()
@click.option("--category", default=None, help="Specific category to run")
@click.option("--noreg", is_flag=True, help="Skip regression tests")
-def start(category, noreg):
-    """Start the benchmark tests. If a category flag is is provided, run the categories with that mark."""
+@click.option("--mock", is_flag=True, help="Run with mock")
+def start(category, noreg, mock):
    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    config_file = "agbenchmark/config.json"

@@ -28,7 +32,8 @@ def start(category, noreg):
        )
        config["port"] = click.prompt("Please enter a new port", default=8080)
        config["workspace"] = click.prompt(
-            "Please enter a new workspace path", default="agbenchmark/mocks/workspace"
+            "Please enter a new workspace path",
+            default=os.path.join(Path.home(), "miniagi"),
        )

        with open(config_dir, "w") as f:
@@ -38,13 +43,17 @@ def start(category, noreg):
        with open(config_dir, "r") as f:
            config = json.load(f)

+    set_key(".env", "MOCK_TEST", "True" if mock else "False")
+    if mock:
+        config["workspace"] = "agbenchmark/mocks/workspace"
+
    # create workspace directory if it doesn't exist
    workspace_path = os.path.abspath(config["workspace"])
    if not os.path.exists(workspace_path):
        os.makedirs(workspace_path, exist_ok=True)

    regression_path = os.path.abspath(
-        "agbenchmark/tests/regression/regression_tests.txt"
+        "agbenchmark/tests/regression/regression_tests.json"
    )
    if not os.path.exists(regression_path):
        with open(regression_path, "a"):
@@ -74,6 +83,9 @@ def start(category, noreg):
        else:
            print("Running all categorys")  # run all categorys

+    if mock:
+        pytest_args.append("--mock")
+
    # Run pytest with the constructed arguments
    pytest.main(pytest_args)

--- a/agbenchmark/tests/basic_abilities/BasicChallenge.py
+++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py
@@ -1,5 +1,7 @@
 import pytest
 from agbenchmark.Challenge import Challenge
+from agbenchmark.challenges.define_task_types import ChallengeData
+from abc import abstractmethod


@pytest.mark.basic
--- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
+++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
@@ -1,13 +1,16 @@
 {
+  "name": "basic_read_file",
  "category": ["basic"],
  "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
-  "dependencies": ["test_write_file"],
+  "dependencies": ["basic_write_file"],
  "ground": {
    "answer": "random string: this is how we're doing",
    "should_contain": ["random string: this is how we're doing"],
    "files": ["file_to_check.txt"]
  },
-  "mock_func": "basic_read_file_mock",
+  "mock": {
+    "mock_func": "basic_read_file_mock"
+  },
  "info": {
    "description": "This reads the file quickly",
    "difficulty": "basic",
--- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
+++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
@@ -4,39 +4,30 @@ from agbenchmark.Challenge import Challenge
 from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
 import os

-data = ChallengeData.deserialize(
-    os.path.join(os.path.dirname(__file__), "r_file_data.json")
-)
-
-
-@pytest.fixture(scope="module", autouse=True)
-def setup_module(workspace):
-    if data.ground.should_contain:
-        Challenge.write_to_file(
-            workspace, data.ground.files[0], "this is how we're doing"
-        )
-

 class TestReadFile(BasicChallenge):
    """Testing if LLM can read a file"""

-    @pytest.mark.parametrize(
-        "server_response",
-        [(data.task, data.mock_func)],
-        indirect=True,
-    )
-    @pytest.mark.parametrize(
-        "regression_data",
-        [data],
-        indirect=True,
-    )
-    @pytest.mark.depends(on=data.dependencies)
-    def test_read_file(self, workspace):
-        files_contents = self.open_files(workspace, data.ground.files)
+    @pytest.fixture(
+        scope="module", autouse=True
+    )  # this is specific to setting up a file for the test, not all tests have this
+    def setup_module(self, workspace):
+        Challenge.write_to_file(
+            workspace, self.data.ground.files[0], "this is how we're doing"
+        )
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "r_file_data.json")
+
+    @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
+    def test_method(
+        self, workspace
+    ):  # test_method is the common name that all tests must implement
+        files_contents = self.open_files(workspace, self.data.ground.files)

        scores = []
        for file_content in files_contents:
-            score = self.scoring(file_content, data.ground)
+            score = self.scoring(file_content, self.data.ground)
            print("Your score is:", score)
            scores.append(score)

--- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
+++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
@@ -1,6 +1,7 @@
 {
+  "name": "basic_write_file",
  "category": ["basic"],
-  "task": "What is the capital of America?",
+  "task": "Print the capital of America to a .txt file",
  "dependencies": [],
  "ground": {
    "answer": "Washington",
@@ -8,7 +9,10 @@
    "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
    "files": [".txt"]
  },
-  "mock_func": "basic_write_file_mock",
+  "mock": {
+    "mock_func": "basic_write_file_mock",
+    "mock_task": "What is the capital of America?"
+  },
  "info": {
    "difficulty": "basic",
    "description": "Tests the writing to file",
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -3,31 +3,21 @@ from agbenchmark.challenges.define_task_types import ChallengeData
 from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
 import os

-data = ChallengeData.deserialize(
-    os.path.join(os.path.dirname(__file__), "w_file_data.json")
-)
-

 class TestWriteFile(BasicChallenge):
    """Testing if LLM can write to a file"""

-    @pytest.mark.parametrize(
-        "server_response",
-        [(data.task, data.mock_func)],
-        indirect=True,
-    )
-    @pytest.mark.parametrize(
-        "regression_data",
-        [data],
-        indirect=True,
-    )
-    @pytest.mark.depends(name="test_write_file")
-    def test_write_file(self, workspace):
-        files_contents = self.open_files(workspace, data.ground.files)
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "w_file_data.json")
+
+    @pytest.mark.depends(on=[], name="basic_write_file")
+    def test_method(self, workspace):
+        print("my workspace is ", workspace)
+        files_contents = self.open_files(workspace, self.data.ground.files)

        scores = []
        for file_content in files_contents:
-            score = self.scoring(file_content, data.ground)
+            score = self.scoring(file_content, self.data.ground)
            print("Your score is:", score)
            scores.append(score)

--- a/agbenchmark/tests/regression/regression_tests.json
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -1 +1,14 @@
-{}
+{
+    "TestWriteFile": {
+        "difficulty": "basic",
+        "dependencies": [],
+        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]"
+    },
+    "TestReadFile": {
+        "difficulty": "basic",
+        "dependencies": [
+            "basic_write_file"
+        ],
+        "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]"
+    }
+}
--- a/agbenchmark/tests/regression/regression_tests.txt
+++ b/agbenchmark/tests/regression/regression_tests.txt
@@ -1,14 +0,0 @@
-{
-    "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": {
-        "difficulty": "easy",
-        "dependencies": [],
-        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]"
-    },
-    "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": {
-        "difficulty": "basic",
-        "dependencies": [
-            "test_write_file"
-        ],
-        "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]"
-    }
-}
--- a/agent/agbenchmark_run.py
+++ b/agent/agbenchmark_run.py
@@ -1,27 +0,0 @@
-import argparse
-import subprocess
-import os
-
-
-def main(objective):
-    # get the current directory
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-
-    # form the command
-    command = (
-        f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}"
-    )
-
-    # run the command
-    subprocess.run(command, shell=True)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.")
-    parser.add_argument(
-        "objective", type=str, help="The objective to pass to miniagi.py"
-    )
-
-    args = parser.parse_args()
-
-    main(args.objective)
--- a/poetry.lock
+++ b/poetry.lock
@@ -644,6 +644,20 @@ future-fstrings = "*"
 networkx = "*"
 pytest = ">=3"

+[[package]]
+name = "python-dotenv"
+version = "1.0.0"
+description = "Read key-value pairs from a .env file and set them as environment variables"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"},
+    {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"},
+]
+
+[package.extras]
+cli = ["click (>=5.0)"]
+
 [[package]]
 name = "requests"
 version = "2.31.0"
@@ -814,4 +828,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c"
+content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ requests = "^2.31.0"
 openai = "^0.27.8"
 pydantic = "^1.10.9"
 pytest-depends = "^1.0.1"
+python-dotenv = "^1.0.0"


 [build-system]
@@ -30,7 +31,7 @@ testpaths = [
 markers = [
    "retrieval",
    "regression",
-    "basic"
+    "basic",
 ]

 [tool.poetry.scripts]