mini-agi, simple challenge creation, --mock flag

This commit is contained in:
Silen Naihin
2023-06-27 18:17:54 -04:00
parent 36ef54340f
commit f933717d8b
19 changed files with 235 additions and 158 deletions

4
.env.example Normal file
View File

@@ -0,0 +1,4 @@
OPENAI_API_KEY=
AGENT_NAME=mini-agi
AGENT_TIMEOUT=60
MOCK_TEST=False

View File

@@ -65,7 +65,7 @@ class TestSomething(CategoryChallenge):
"""Testing if LLM can read a file"""
@pytest.mark.parametrize(
"server_response",
"run_agent",
[(data.task, data.mock_func)],
indirect=True,
)

View File

@@ -1,12 +1,63 @@
import os
import glob
import pytest
from abc import ABC, abstractmethod
from agbenchmark.challenges.define_task_types import Ground
from agbenchmark.challenges.define_task_types import ChallengeData
from dotenv import load_dotenv, set_key
load_dotenv()
mock_test_str = os.getenv("MOCK_TEST")
MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
class Challenge:
class Challenge(ABC):
"""The parent class to all specific challenges classes.
Defines helper methods for running a challenge"""
@abstractmethod
def get_file_path(self) -> str:
"""Return the path of this challenge's JSON data file.
This should be implemented by any class which inherits from Challenge"""
pass
@property
def data(self) -> ChallengeData:
"""Challenge definition loaded from the file returned by get_file_path().
Nothing is cached here: ChallengeData.deserialize is called again on every access."""
return ChallengeData.deserialize(self.get_file_path())
@property
def mock(self):
"""Name of the mock function from the challenge data, or None when the data has no mock section"""
return self.data.mock.mock_func if self.data.mock else None
@property
def task(self):
    """Return the task prompt for this challenge.

    When ``MOCK_TEST`` is true and the challenge data defines a
    ``mock.mock_task``, that text is returned; in every other case the
    regular ``task`` from the challenge data is returned.
    """
    # Read the data once: every access to ``self.data`` deserializes the file again.
    challenge = self.data
    mock_section = challenge.mock
    # ``mock_task`` is optional, so a mock section without it must fall back
    # to the real task instead of yielding None.
    if MOCK_TEST and mock_section and mock_section.mock_task:
        return mock_section.mock_task
    return challenge.task
@property
def dependencies(self) -> list:
"""Dependencies listed in the challenge data"""
# NOTE(review): looks like leftover debug output; it is printed on every access
print("self.data.dependencies", self.data.dependencies)
return self.data.dependencies
@property
def name(self) -> str:
"""Name of the challenge as given in the challenge data"""
# NOTE(review): looks like leftover debug output; it is printed on every access
print("self.data.name", self.data.name)
return self.data.name
# NOTE(review): if this sits in the class body, `task`, `mock` and `data` below are
# the property objects defined above, not per-challenge values - confirm this is intended
@pytest.mark.parametrize(
"run_agent",
[(task, mock)],
indirect=True,
)
@pytest.mark.parametrize(
"challenge_data",
[data],
indirect=True,
)
def test_method(self, workspace):
"""Placeholder test body; always raises NotImplementedError.
The concrete challenge test classes define their own test_method."""
raise NotImplementedError
@staticmethod
def open_file(workspace: str, filename: str):
script_dir = os.path.abspath(workspace)

View File

@@ -4,6 +4,11 @@ import json
import os
# Mock section of a challenge data file
class Mock(BaseModel):
# name of the mock function (required whenever a mock section is present)
mock_func: str
# optional alternate task text; Challenge.task reads it when MOCK_TEST is set
mock_task: Optional[str] = None
class Info(BaseModel):
difficulty: str
description: str
@@ -12,17 +17,18 @@ class Info(BaseModel):
class Ground(BaseModel):
answer: str
should_contain: Optional[List[str]]
should_not_contain: Optional[List[str]]
should_contain: Optional[List[str]] = None
should_not_contain: Optional[List[str]] = None
files: List[str]
class ChallengeData(BaseModel):
name: str
category: List[str]
task: str
dependencies: List[str]
ground: Ground
mock_func: Optional[str] = None
mock: Optional[Mock] = None
info: Info
def serialize(self, path: str) -> None:

View File

@@ -1,16 +1,20 @@
{
"name": "retrieval1",
"category": ["basic"],
"task": "What is the capital of America?",
"task": "Print the capital of America to a .txt file",
"dependencies": [],
"ground": {
"answer": "Washington",
"should_contain": ["Washington"],
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": ["file_to_check.txt"]
"files": [".txt"]
},
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"mock_func": "basic_write_file_mock",
"info": {
"difficulty": "easy",
"difficulty": "basic",
"description": "Tests the writing to file",
"side_effects": ["tests if there is in fact an LLM attached"]
}

View File

@@ -4,30 +4,18 @@ from agbenchmark.challenges.define_task_types import ChallengeData, Ground
import os
data = ChallengeData.deserialize(
os.path.join(os.path.dirname(__file__), "r1_data.json")
)
class TestRetrieval1(RetrievalChallenge):
"""The first information-retrieval challenge"""
@pytest.mark.parametrize(
"server_response",
[(data.task, data.mock_func)],
indirect=True,
)
@pytest.mark.parametrize(
"regression_data",
[data],
indirect=True,
)
def test_retrieval(self, workspace, current_challenge_data):
files_contents = self.open_files(workspace, data.ground.files)
def get_file_path(self) -> str: # all tests must implement this method
return os.path.join(os.path.dirname(__file__), "r1_data.json")
def test_method(self, workspace):
files_contents = self.open_files(workspace, self.data.ground.files)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, data.ground)
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)

View File

@@ -1,5 +1,5 @@
{
"hostname": "localhost",
"port": 8080,
"workspace": "agbenchmark/mocks/workspace"
"workspace": "C:/Users/silen/miniagi"
}

View File

@@ -4,18 +4,24 @@ import pytest
import shutil
from agbenchmark.tests.regression.RegressionManager import RegressionManager
import requests
from requests.exceptions import RequestException
from agbenchmark.mocks.MockManager import MockManager
from agbenchmark.challenges.define_task_types import ChallengeData
import subprocess
from agbenchmark.Challenge import Challenge
from dotenv import load_dotenv
load_dotenv()
@pytest.fixture(scope="module")
def config():
def config(request):
config_file = os.path.abspath("agbenchmark/config.json")
print(f"Config file: {config_file}")
with open(config_file, "r") as f:
config = json.load(f)
if request.config.getoption("--mock"):
config["workspace"] = "agbenchmark/mocks/workspace"
return config
@@ -34,43 +40,49 @@ def workspace(config):
print(f"Failed to delete {file_path}. Reason: {e}")
def pytest_addoption(parser):
"""Register the --mock command-line flag (a store_true option, False by default)"""
parser.addoption("--mock", action="store_true", default=False)
AGENT_NAME = os.getenv("AGENT_NAME")
AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT")
@pytest.fixture(autouse=True)
def server_response(request, config):
def run_agent(request, config):
"""Calling to get a response"""
if isinstance(request.param, tuple):
task = request.param[0] # The task is passed in indirectly
mock_function_name = request.param[1]
mock_function_name = request.param[1] or None
else:
task = request.param
mock_function_name = None
# get the current file's directory
current_dir = os.path.dirname(os.path.abspath(__file__))
if mock_function_name != None and (request.config.getoption("--mock")):
if mock_function_name:
mock_manager = MockManager(
task
) # workspace doesn't need to be passed in, stays the same
print("Server unavailable, using mock", mock_function_name)
mock_manager.delegate(mock_function_name)
else:
print("No mock provided")
else:
path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
# construct the script's path
script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py")
try:
timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60
# form the command
command = ["python", script_path, task]
# if mock_function_name:
# mock_manager = MockManager(
# task
# ) # workspace doesn't need to be passed in, stays the same
# print("Server unavailable, using mock", mock_function_name)
# mock_manager.delegate(mock_function_name)
# else:
# print("No mock provided")
try:
# run the command and wait for it to complete
result = subprocess.run(
command, shell=True, check=True, text=True, capture_output=True
)
return result
except subprocess.CalledProcessError as e:
print(f"Subprocess failed with the following error:\n{e}")
# If the subprocess returns a non-zero exit status
subprocess.run(
["python", "miniagi.py", task],
check=True,
cwd=path,
timeout=timeout
# text=True,
# capture_output=True
)
except subprocess.TimeoutExpired:
print("The subprocess has exceeded the time limit and was terminated.")
regression_json = "agbenchmark/tests/regression/regression_tests.json"
@@ -80,13 +92,13 @@ regression_manager = RegressionManager(regression_json)
# this is to get the challenge_data from every test
@pytest.fixture(autouse=True)
def regression_data(request):
def challenge_data(request):
return request.param
def pytest_runtest_makereport(item, call):
if call.when == "call":
challenge_data = item.funcargs.get("regression_data", None)
challenge_data = item.funcargs.get("challenge_data", None)
difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
dependencies = challenge_data.dependencies if challenge_data else []
@@ -105,9 +117,9 @@ def pytest_runtest_makereport(item, call):
def pytest_collection_modifyitems(items):
"""Called once all test items are collected. Used
to add regression marker to collected test items."""
to add regression and depends markers to collected test items."""
for item in items:
print("pytest_collection_modifyitems", item.nodeid)
# regression add
if item.nodeid.split("::")[1] in regression_manager.tests:
print(regression_manager.tests)
item.add_marker(pytest.mark.regression)
@@ -116,3 +128,26 @@ def pytest_collection_modifyitems(items):
def pytest_sessionfinish():
"""Called at the end of the session to save regression tests"""
regression_manager.save()
# this is so that all tests can inherit from the Challenge class
def pytest_generate_tests(metafunc):
    """Parametrize the ``challenge_data`` and ``run_agent`` fixtures per test class.

    An instance of the test's class is created and its ``data``, ``task`` and
    ``mock`` attributes are passed to the two fixtures as indirect parameters.

    Args:
        metafunc: the object pytest hands to this hook for the test being
            collected; its ``cls``, ``fixturenames`` and ``parametrize``
            members are used.
    """
    # Module-level test functions have no class to read the values from;
    # calling ``metafunc.cls()`` for them would raise TypeError at collection.
    if metafunc.cls is None:
        return

    requested = metafunc.fixturenames
    wants_data = "challenge_data" in requested
    wants_agent = "run_agent" in requested
    if not (wants_data or wants_agent):
        return

    # One instance is enough to serve both fixtures.
    test_class = metafunc.cls()

    if wants_data:
        metafunc.parametrize("challenge_data", [test_class.data], indirect=True)

    if wants_agent:
        metafunc.parametrize(
            "run_agent", [(test_class.task, test_class.mock)], indirect=True
        )

View File

@@ -2,6 +2,10 @@ import click
import pytest
import json
import os
from pathlib import Path
from dotenv import load_dotenv, set_key
load_dotenv()
@click.group()
@@ -12,8 +16,8 @@ def cli():
@cli.command()
@click.option("--category", default=None, help="Specific category to run")
@click.option("--noreg", is_flag=True, help="Skip regression tests")
def start(category, noreg):
"""Start the benchmark tests. If a category flag is is provided, run the categories with that mark."""
@click.option("--mock", is_flag=True, help="Run with mock")
def start(category, noreg, mock):
"""Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
config_file = "agbenchmark/config.json"
@@ -28,7 +32,8 @@ def start(category, noreg):
)
config["port"] = click.prompt("Please enter a new port", default=8080)
config["workspace"] = click.prompt(
"Please enter a new workspace path", default="agbenchmark/mocks/workspace"
"Please enter a new workspace path",
default=os.path.join(Path.home(), "miniagi"),
)
with open(config_dir, "w") as f:
@@ -38,13 +43,17 @@ def start(category, noreg):
with open(config_dir, "r") as f:
config = json.load(f)
set_key(".env", "MOCK_TEST", "True" if mock else "False")
if mock:
config["workspace"] = "agbenchmark/mocks/workspace"
# create workspace directory if it doesn't exist
workspace_path = os.path.abspath(config["workspace"])
if not os.path.exists(workspace_path):
os.makedirs(workspace_path, exist_ok=True)
regression_path = os.path.abspath(
"agbenchmark/tests/regression/regression_tests.txt"
"agbenchmark/tests/regression/regression_tests.json"
)
if not os.path.exists(regression_path):
with open(regression_path, "a"):
@@ -74,6 +83,9 @@ def start(category, noreg):
else:
print("Running all categorys") # run all categorys
if mock:
pytest_args.append("--mock")
# Run pytest with the constructed arguments
pytest.main(pytest_args)

View File

@@ -1,5 +1,7 @@
import pytest
from agbenchmark.Challenge import Challenge
from agbenchmark.challenges.define_task_types import ChallengeData
from abc import abstractmethod
@pytest.mark.basic

View File

@@ -1,13 +1,16 @@
{
"name": "basic_read_file",
"category": ["basic"],
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["test_write_file"],
"dependencies": ["basic_write_file"],
"ground": {
"answer": "random string: this is how we're doing",
"should_contain": ["random string: this is how we're doing"],
"files": ["file_to_check.txt"]
},
"mock_func": "basic_read_file_mock",
"mock": {
"mock_func": "basic_read_file_mock"
},
"info": {
"description": "This reads the file quickly",
"difficulty": "basic",

View File

@@ -4,39 +4,30 @@ from agbenchmark.Challenge import Challenge
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
data = ChallengeData.deserialize(
os.path.join(os.path.dirname(__file__), "r_file_data.json")
)
@pytest.fixture(scope="module", autouse=True)
def setup_module(workspace):
if data.ground.should_contain:
Challenge.write_to_file(
workspace, data.ground.files[0], "this is how we're doing"
)
class TestReadFile(BasicChallenge):
"""Testing if LLM can read a file"""
@pytest.mark.parametrize(
"server_response",
[(data.task, data.mock_func)],
indirect=True,
)
@pytest.mark.parametrize(
"regression_data",
[data],
indirect=True,
)
@pytest.mark.depends(on=data.dependencies)
def test_read_file(self, workspace):
files_contents = self.open_files(workspace, data.ground.files)
@pytest.fixture(
scope="module", autouse=True
) # this is specific to setting up a file for the test, not all tests have this
def setup_module(self, workspace):
Challenge.write_to_file(
workspace, self.data.ground.files[0], "this is how we're doing"
)
def get_file_path(self) -> str: # all tests must implement this method
return os.path.join(os.path.dirname(__file__), "r_file_data.json")
@pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
def test_method(
self, workspace
): # test_method is a common name that all tests must implement
files_contents = self.open_files(workspace, self.data.ground.files)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, data.ground)
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)

View File

@@ -1,6 +1,7 @@
{
"name": "basic_write_file",
"category": ["basic"],
"task": "What is the capital of America?",
"task": "Print the capital of America to a .txt file",
"dependencies": [],
"ground": {
"answer": "Washington",
@@ -8,7 +9,10 @@
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": [".txt"]
},
"mock_func": "basic_write_file_mock",
"mock": {
"mock_func": "basic_write_file_mock",
"mock_task": "What is the capital of America?"
},
"info": {
"difficulty": "basic",
"description": "Tests the writing to file",

View File

@@ -3,31 +3,21 @@ from agbenchmark.challenges.define_task_types import ChallengeData
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
data = ChallengeData.deserialize(
os.path.join(os.path.dirname(__file__), "w_file_data.json")
)
class TestWriteFile(BasicChallenge):
"""Testing if LLM can write to a file"""
@pytest.mark.parametrize(
"server_response",
[(data.task, data.mock_func)],
indirect=True,
)
@pytest.mark.parametrize(
"regression_data",
[data],
indirect=True,
)
@pytest.mark.depends(name="test_write_file")
def test_write_file(self, workspace):
files_contents = self.open_files(workspace, data.ground.files)
def get_file_path(self) -> str: # all tests must implement this method
return os.path.join(os.path.dirname(__file__), "w_file_data.json")
@pytest.mark.depends(on=[], name="basic_write_file")
def test_method(self, workspace):
print("my workspace is ", workspace)
files_contents = self.open_files(workspace, self.data.ground.files)
scores = []
for file_content in files_contents:
score = self.scoring(file_content, data.ground)
score = self.scoring(file_content, self.data.ground)
print("Your score is:", score)
scores.append(score)

View File

@@ -1 +1,14 @@
{}
{
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"basic_write_file"
],
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]"
}
}

View File

@@ -1,14 +0,0 @@
{
"agbenchmark/tests/basic_abilities/write_file/write_file_test.py": {
"difficulty": "easy",
"dependencies": [],
"test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]"
},
"agbenchmark/tests/basic_abilities/read_file/read_file_test.py": {
"difficulty": "basic",
"dependencies": [
"test_write_file"
],
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]"
}
}

View File

@@ -1,27 +0,0 @@
import argparse
import subprocess
import os
def main(objective):
# get the current directory
current_dir = os.path.dirname(os.path.abspath(__file__))
# form the command
command = (
f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}"
)
# run the command
subprocess.run(command, shell=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.")
parser.add_argument(
"objective", type=str, help="The objective to pass to miniagi.py"
)
args = parser.parse_args()
main(args.objective)

16
poetry.lock generated
View File

@@ -644,6 +644,20 @@ future-fstrings = "*"
networkx = "*"
pytest = ">=3"
[[package]]
name = "python-dotenv"
version = "1.0.0"
description = "Read key-value pairs from a .env file and set them as environment variables"
optional = false
python-versions = ">=3.8"
files = [
{file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"},
{file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"},
]
[package.extras]
cli = ["click (>=5.0)"]
[[package]]
name = "requests"
version = "2.31.0"
@@ -814,4 +828,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c"
content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe"

View File

@@ -15,6 +15,7 @@ requests = "^2.31.0"
openai = "^0.27.8"
pydantic = "^1.10.9"
pytest-depends = "^1.0.1"
python-dotenv = "^1.0.0"
[build-system]
@@ -30,7 +31,7 @@ testpaths = [
markers = [
"retrieval",
"regression",
"basic"
"basic",
]
[tool.poetry.scripts]