mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-30 03:00:41 -04:00
Release v0.4.1 (#4686)
Co-authored-by: Reinier van der Leer <github@pwuts.nl> Co-authored-by: Nicholas Tindle <nick@ntindle.com> Co-authored-by: Nicholas Tindle <nicktindle@outlook.com> Co-authored-by: k-boikov <64261260+k-boikov@users.noreply.github.com> Co-authored-by: merwanehamadi <merwanehamadi@gmail.com> Co-authored-by: Merwane Hamadi <merwanehamadi@gmail.com> Co-authored-by: Richard Beales <rich@richbeales.net> Co-authored-by: Luke K <2609441+lc0rp@users.noreply.github.com> Co-authored-by: Luke K (pr-0f3t) <2609441+lc0rp@users.noreply.github.com> Co-authored-by: Erik Peterson <e@eriklp.com> Co-authored-by: Auto-GPT-Bot <github-bot@agpt.co> Co-authored-by: Benny van der Lans <49377421+bfalans@users.noreply.github.com> Co-authored-by: Jan <jan-github@phobia.de> Co-authored-by: Robin Richtsfeld <robin.richtsfeld@gmail.com> Co-authored-by: Marc Bornträger <marc.borntraeger@gmail.com> Co-authored-by: Stefan Ayala <stefanayala3266@gmail.com> Co-authored-by: javableu <45064273+javableu@users.noreply.github.com> Co-authored-by: DGdev91 <DGdev91@users.noreply.github.com> Co-authored-by: Kinance <kinance@gmail.com> Co-authored-by: digger yu <digger-yu@outlook.com> Co-authored-by: David <scenaristeur@gmail.com> Co-authored-by: gravelBridge <john.tian31@gmail.com> Fix Python CI "update cassettes" step (#4591) fix CI (#4596) Fix inverted logic for deny_command (#4563) fix current_score.json generation (#4601) Fix duckduckgo rate limiting (#4592) Fix debug code challenge (#4632) Fix issues with information retrieval challenge a (#4622) fix issues with env configuration and .env.template (#4630) Fix prompt issue causing 'No Command' issues and challenge to fail (#4623) Fix benchmark logs (#4653) Fix typo in docs/setup.md (#4613) Fix run.sh shebang (#4561) Fix autogpt docker image not working because missing prompt_settings (#4680) Fix execute_command coming from plugins (#4730)
This commit is contained in:
0
tests/challenges/__init__.py
Normal file
0
tests/challenges/__init__.py
Normal file
0
tests/challenges/basic_abilities/__init__.py
Normal file
0
tests/challenges/basic_abilities/__init__.py
Normal file
10
tests/challenges/basic_abilities/goal_oriented_tasks.md
Normal file
10
tests/challenges/basic_abilities/goal_oriented_tasks.md
Normal file
@@ -0,0 +1,10 @@
If the goal oriented task pipeline fails, it means:

- you somehow changed the way the system prompt is generated
- or you broke autogpt.

To know which one, you can run the following command:

```bash
pytest -s tests/challenges
```

If the test is successful, it will record new cassettes in VCR. Then you can just push these to your branch and the pipeline
will pass.
25
tests/challenges/basic_abilities/test_browse_website.py
Normal file
25
tests/challenges/basic_abilities/test_browse_website.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import pytest
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import run_interaction_loop
|
||||
|
||||
CYCLE_COUNT = 2
|
||||
|
||||
|
||||
@challenge()
def test_browse_website(
    browser_agent: Agent,
    patched_api_requestor: None,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """Challenge: the agent browses a website and writes a price to a file.

    Runs the interaction loop, then checks that the expected price was
    written to browse_website.txt in the agent's workspace.

    :param browser_agent: The agent to test.
    :param patched_api_requestor: Sends api requests to our API CI pipeline.
    :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
    :param level_to_run: The level to run.
    :param challenge_name: Name under which the challenge result is recorded.
    """
    file_path = browser_agent.workspace.get_path("browse_website.txt")
    run_interaction_loop(
        monkeypatch, browser_agent, CYCLE_COUNT, challenge_name, level_to_run
    )

    # Use a context manager so the handle is closed deterministically
    # (the original `open(...).read()` leaked the file handle until GC).
    with open(file_path, encoding="utf-8") as file:
        content = file.read()
    assert "£25.89" in content, f"Expected £25.89, got {content}"
|
||||
42
tests/challenges/basic_abilities/test_write_file.py
Normal file
42
tests/challenges/basic_abilities/test_write_file.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.commands.file_operations import read_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import get_workspace_path, run_interaction_loop
|
||||
|
||||
CYCLE_COUNT_PER_LEVEL = [1, 1]
|
||||
EXPECTED_OUTPUTS_PER_LEVEL = [
|
||||
{"hello_world.txt": ["Hello World"]},
|
||||
{"hello_world_1.txt": ["Hello World"], "hello_world_2.txt": ["Hello World"]},
|
||||
]
|
||||
|
||||
|
||||
@challenge()
def test_write_file(
    file_system_agents: List[Agent],
    patched_api_requestor: None,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """Challenge: the agent writes expected text into one or more files.

    Picks the agent and cycle budget for the requested level, runs the
    interaction loop, then verifies that every expected file in the agent's
    workspace contains all of its expected lines.
    """
    level_idx = level_to_run - 1
    agent = file_system_agents[level_idx]
    run_interaction_loop(
        monkeypatch,
        agent,
        CYCLE_COUNT_PER_LEVEL[level_idx],
        challenge_name,
        level_to_run,
    )

    for file_name, expected_lines in EXPECTED_OUTPUTS_PER_LEVEL[level_idx].items():
        content = read_file(get_workspace_path(agent, file_name), agent)
        for expected_line in expected_lines:
            assert (
                expected_line in content
            ), f"Expected '{expected_line}' in file {file_name}, but it was not found"
|
||||
0
tests/challenges/challenge_decorator/__init__.py
Normal file
0
tests/challenges/challenge_decorator/__init__.py
Normal file
24
tests/challenges/challenge_decorator/challenge.py
Normal file
24
tests/challenges/challenge_decorator/challenge.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class Challenge:
    """Record of a single challenge attempt and its score bookkeeping.

    Class attributes:
        BEAT_CHALLENGES: global toggle, set from the --beat-challenges option.
        DEFAULT_CHALLENGE_NAME: placeholder used before the real name is known.
    """

    BEAT_CHALLENGES = False
    DEFAULT_CHALLENGE_NAME = "default_challenge_name"

    def __init__(
        self,
        name: str,
        category: str,
        max_level: int,
        is_new_challenge: bool,
        max_level_beaten: Optional[int] = None,
        level_to_run: Optional[int] = None,
    ) -> None:
        # Identity
        self.name = name
        self.category = category
        self.is_new_challenge = is_new_challenge
        # Level bookkeeping
        self.max_level = max_level
        self.max_level_beaten = max_level_beaten
        self.level_to_run = level_to_run
        # Outcome flags, updated by the challenge decorator during the run
        self.succeeded = False
        self.skipped = False
|
||||
89
tests/challenges/challenge_decorator/challenge_decorator.py
Normal file
89
tests/challenges/challenge_decorator/challenge_decorator.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import os
|
||||
from functools import wraps
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import pytest
|
||||
from flaky import flaky # type: ignore
|
||||
|
||||
from tests.challenges.challenge_decorator.challenge import Challenge
|
||||
from tests.challenges.challenge_decorator.challenge_utils import create_challenge
|
||||
from tests.challenges.challenge_decorator.score_utils import (
|
||||
get_scores,
|
||||
update_new_score,
|
||||
)
|
||||
from tests.utils import requires_api_key
|
||||
|
||||
MAX_LEVEL_TO_IMPROVE_ON = (
|
||||
1 # we will attempt to beat 1 level above the current level for now.
|
||||
)
|
||||
|
||||
CHALLENGE_FAILED_MESSAGE = "Challenges can sometimes fail randomly, please run this test again and if it fails reach out to us on https://discord.gg/autogpt in the 'challenges' channel to let us know the challenge you're struggling with."
|
||||
|
||||
|
||||
def challenge(
    max_runs: int = 2, min_passes: int = 1, api_key: str = "OPENAI_API_KEY"
) -> Callable[[Callable[..., Any]], Callable[..., None]]:
    """Decorator for challenge tests.

    Wraps the test with an API-key requirement, VCR cassette recording and
    flaky retries (max_runs/min_passes), and records the outcome into the
    per-process new-score file when running in CI.
    """

    def decorator(func: Callable[..., Any]) -> Callable[..., None]:
        @requires_api_key(api_key)
        @pytest.mark.vcr
        @flaky(max_runs=max_runs, min_passes=min_passes)
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> None:
            # In beat-challenges mode we loop up to MAX_LEVEL_TO_IMPROVE_ON
            # times; otherwise the challenge is attempted exactly once.
            run_remaining = MAX_LEVEL_TO_IMPROVE_ON if Challenge.BEAT_CHALLENGES else 1
            original_error: Optional[Exception] = None

            while run_remaining > 0:
                current_score, new_score, new_score_location = get_scores()
                # An explicit level may be passed through from the --level fixture.
                level_to_run = (
                    kwargs["level_to_run"] if "level_to_run" in kwargs else None
                )
                # NOTE: this local shadows the outer `challenge` decorator name.
                challenge = create_challenge(
                    func, current_score, Challenge.BEAT_CHALLENGES, level_to_run
                )
                if challenge.level_to_run is not None:
                    kwargs["level_to_run"] = challenge.level_to_run
                    kwargs["challenge_name"] = challenge.name
                    try:
                        func(*args, **kwargs)
                        challenge.succeeded = True
                    except AssertionError as err:
                        # Prepend the "try again / reach out on Discord" hint.
                        original_error = AssertionError(
                            f"{CHALLENGE_FAILED_MESSAGE}\n{err}"
                        )
                        challenge.succeeded = False
                    except Exception as err:
                        original_error = err
                        challenge.succeeded = False
                else:
                    # No level to run means the challenge is not unlocked yet.
                    challenge.skipped = True
                # Scores are only persisted when running in CI.
                if os.environ.get("CI") == "true":
                    new_max_level_beaten = get_new_max_level_beaten(
                        challenge, Challenge.BEAT_CHALLENGES
                    )
                    update_new_score(
                        new_score_location, new_score, challenge, new_max_level_beaten
                    )
                if challenge.level_to_run is None:
                    pytest.skip("This test has not been unlocked yet.")

                if not challenge.succeeded:
                    if Challenge.BEAT_CHALLENGES or challenge.is_new_challenge:
                        # New or beat-mode challenges are expected failures.
                        pytest.xfail(str(original_error))
                    if original_error:
                        raise original_error
                run_remaining -= 1

        return wrapper

    return decorator
|
||||
|
||||
|
||||
def get_new_max_level_beaten(
    challenge: Challenge, beat_challenges: bool
) -> Optional[int]:
    """Compute the max_level_beaten value to record after this run.

    Success records the level just run; a skipped run keeps the previous
    value; a failure keeps it only in beat-challenges mode, and clears the
    score otherwise.
    """
    if challenge.succeeded:
        return challenge.level_to_run
    if challenge.skipped or beat_challenges:
        # Skips never change the score; failures keep it in beat mode.
        return challenge.max_level_beaten
    return None
|
||||
85
tests/challenges/challenge_decorator/challenge_utils.py
Normal file
85
tests/challenges/challenge_decorator/challenge_utils.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import os
|
||||
from typing import Any, Callable, Dict, Optional, Tuple
|
||||
|
||||
from tests.challenges.challenge_decorator.challenge import Challenge
|
||||
|
||||
CHALLENGE_PREFIX = "test_"
|
||||
|
||||
|
||||
def create_challenge(
    func: Callable[..., Any],
    current_score: Dict[str, Any],
    is_beat_challenges: bool,
    level_to_run: Optional[int] = None,
) -> Challenge:
    """Build a Challenge for *func* from the recorded scores.

    Derives the challenge's category/name from the test function, looks up
    its recorded levels, and decides which level this run should attempt.
    """
    category, name = get_challenge_identifiers(func)
    # A challenge is "new" when the score file has no entry for it yet.
    is_new = name not in current_score.get(category, {})

    max_level = get_max_level(current_score, category, name)
    max_level_beaten = get_max_level_beaten(current_score, category, name)
    chosen_level = get_level_to_run(
        is_beat_challenges, level_to_run, max_level, max_level_beaten, is_new
    )

    return Challenge(
        name=name,
        category=category,
        max_level=max_level,
        max_level_beaten=max_level_beaten,
        level_to_run=chosen_level,
        is_new_challenge=is_new,
    )
|
||||
|
||||
|
||||
def get_level_to_run(
    is_beat_challenges: bool,
    level_to_run: Optional[int],
    max_level: int,
    max_level_beaten: Optional[int],
    is_new_challenge: bool,
) -> Optional[int]:
    """Decide which level to attempt, or None to skip the challenge.

    Priority: new challenges start at level 1; an explicit level wins but
    must not exceed max_level; beat mode attempts one level above the best
    beaten level (None once max_level is beaten); otherwise re-run the best
    beaten level.

    Raises:
        ValueError: if an explicit level_to_run exceeds max_level.
    """
    if is_new_challenge:
        return 1

    if level_to_run is not None:
        if level_to_run > max_level:
            raise ValueError(
                f"Level to run ({level_to_run}) is greater than max level ({max_level})"
            )
        return level_to_run

    if not is_beat_challenges:
        return max_level_beaten

    # Beat mode: nothing left to attempt once the top level is beaten.
    if max_level_beaten == max_level:
        return None
    return (max_level_beaten or 0) + 1
|
||||
|
||||
|
||||
def get_challenge_identifiers(func: Callable[..., Any]) -> Tuple[str, str]:
    """Derive (category, name) for a challenge test function.

    The category is the name of the directory containing the test file; the
    name is the function name with the leading "test_" prefix stripped.
    """
    full_path = os.path.dirname(os.path.abspath(func.__code__.co_filename))
    challenge_category = os.path.basename(full_path)
    # removeprefix only strips a *leading* "test_"; the previous str.replace
    # also removed the substring anywhere in the name (e.g.
    # "test_foo_test_bar" -> "foo_bar"), corrupting such names.
    challenge_name = func.__name__.removeprefix(CHALLENGE_PREFIX)
    return challenge_category, challenge_name
|
||||
|
||||
|
||||
def get_max_level(
    current_score: Dict[str, Any],
    challenge_category: str,
    challenge_name: str,
) -> int:
    """Look up the challenge's recorded max_level; defaults to 1 when unset."""
    category_scores = current_score.get(challenge_category, {})
    challenge_scores = category_scores.get(challenge_name, {})
    return challenge_scores.get("max_level", 1)
|
||||
|
||||
|
||||
def get_max_level_beaten(
    current_score: Dict[str, Any],
    challenge_category: str,
    challenge_name: str,
) -> Optional[int]:
    """Look up the challenge's recorded max_level_beaten; None when unset."""
    category_scores = current_score.get(challenge_category, {})
    challenge_scores = category_scores.get(challenge_name, {})
    return challenge_scores.get("max_level_beaten")
|
||||
59
tests/challenges/challenge_decorator/score_utils.py
Normal file
59
tests/challenges/challenge_decorator/score_utils.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
from tests.challenges.challenge_decorator.challenge import Challenge
|
||||
|
||||
CURRENT_SCORE_LOCATION = "../current_score"
|
||||
NEW_SCORE_LOCATION = "../new_score"
|
||||
|
||||
|
||||
def update_new_score(
    filename_new_score: str,
    new_score: Dict[str, Any],
    challenge: Challenge,
    new_max_level_beaten: Optional[int],
) -> None:
    """Record the challenge's result in *new_score* and persist it.

    Mutates *new_score* in place, then writes it to *filename_new_score*.
    """
    write_new_score(new_score, challenge, new_max_level_beaten)
    write_new_score_to_file(new_score, filename_new_score)
|
||||
|
||||
|
||||
def write_new_score(
    new_score: Dict[str, Any], challenge: Challenge, new_max_level_beaten: Optional[int]
) -> Dict[str, Any]:
    """Store the challenge's result under its category/name in *new_score*.

    Mutates *new_score* in place and returns it.
    """
    category_scores = new_score.setdefault(challenge.category, {})
    category_scores[challenge.name] = {
        "max_level_beaten": new_max_level_beaten,
        "max_level": challenge.max_level,
    }
    return new_score
|
||||
|
||||
|
||||
def write_new_score_to_file(new_score: Dict[str, Any], filename: str) -> None:
    """Serialize *new_score* as pretty-printed (indent=4) JSON to *filename*."""
    serialized = json.dumps(new_score, indent=4)
    with open(filename, "w") as file:
        file.write(serialized)
|
||||
|
||||
|
||||
def get_scores() -> Tuple[Dict[str, Any], Dict[str, Any], str]:
    """Load the current and new score dicts from disk.

    Returns (current_score, new_score, filename_new_score); the filename is
    returned so callers can write updates back to the per-process new-score
    file. Missing files load as empty dicts.
    """
    filename_current_score, filename_new_score = get_score_locations()
    current_score = load_json(filename_current_score)
    new_score = load_json(filename_new_score)
    return current_score, new_score, filename_new_score
|
||||
|
||||
|
||||
def load_json(filename: str) -> Dict[str, Any]:
    """Parse *filename* as JSON; return {} when the file does not exist."""
    if not os.path.isfile(filename):
        return {}
    with open(filename, "r") as file:
        return json.load(file)
|
||||
|
||||
|
||||
def get_score_locations() -> Tuple[str, str]:
    """Build the paths of the current-score and new-score JSON files.

    The new-score filename embeds the process id so parallel pytest workers
    don't clobber each other's results.
    """
    pid = os.getpid()
    # NOTE: despite the name, this is the directory containing this module;
    # the score files live one level up via the "../" in the constants.
    project_root = os.path.dirname(os.path.abspath(__file__))
    filename_current_score = os.path.join(
        project_root, f"{CURRENT_SCORE_LOCATION}.json"
    )
    filename_new_score = os.path.join(project_root, f"{NEW_SCORE_LOCATION}_{pid}.json")
    return filename_current_score, filename_new_score
|
||||
61
tests/challenges/conftest.py
Normal file
61
tests/challenges/conftest.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import pytest
|
||||
from _pytest.config import Config
|
||||
from _pytest.config.argparsing import Parser
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
|
||||
from tests.challenges.challenge_decorator.challenge import Challenge
|
||||
from tests.vcr import before_record_response
|
||||
|
||||
|
||||
def before_record_response_filter_errors(
    response: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    """In challenges we don't want to record errors (See issue #4461)

    Returning None tells VCR to drop the response; any response with an
    HTTP status code >= 400 is dropped, everything else is passed through
    to the shared before_record_response hook.
    """
    if response["status"]["code"] >= 400:
        return None

    return before_record_response(response)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def vcr_config(get_base_vcr_config: Dict[str, Any]) -> Dict[str, Any]:
    # this fixture is called by the pytest-recording vcr decorator.
    # Extend the base VCR config so error responses are never recorded.
    return get_base_vcr_config | {
        "before_record_response": before_record_response_filter_errors,
    }
|
||||
|
||||
|
||||
def pytest_addoption(parser: Parser) -> None:
    """Register the challenge-specific pytest command-line options."""
    parser.addoption(
        "--level", action="store", default=None, type=int, help="Specify test level"
    )
    parser.addoption(
        "--beat-challenges",
        action="store_true",
        # Fixed typo in user-visible help text ("Spepcifies" -> "Specifies").
        help="Specifies whether the test suite should attempt to beat challenges",
    )
|
||||
|
||||
|
||||
def pytest_configure(config: Config) -> None:
    """Copy the challenge CLI options onto config.option for fixture access."""
    level = config.getoption("--level", default=None)
    config.option.level = level
    beat_challenges = config.getoption("--beat-challenges", default=False)
    config.option.beat_challenges = beat_challenges
|
||||
|
||||
|
||||
@pytest.fixture
def level_to_run(request: FixtureRequest) -> Optional[int]:
    ## used for challenges in the goal oriented tests
    # Returns the value of --level; None when the option was not given
    # (the option defaults to None in pytest_addoption).
    return request.config.option.level
|
||||
|
||||
|
||||
@pytest.fixture
def challenge_name() -> str:
    # Placeholder value; the @challenge decorator overwrites the
    # challenge_name kwarg with the real name at run time.
    return Challenge.DEFAULT_CHALLENGE_NAME
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def check_beat_challenges(request: FixtureRequest) -> None:
    # Autouse: propagate the --beat-challenges flag onto the Challenge class
    # before every test so the challenge decorator sees the current mode.
    Challenge.BEAT_CHALLENGES = request.config.getoption("--beat-challenges")
|
||||
52
tests/challenges/current_score.json
Normal file
52
tests/challenges/current_score.json
Normal file
@@ -0,0 +1,52 @@
|
||||
{
|
||||
"basic_abilities": {
|
||||
"browse_website": {
|
||||
"max_level": 1,
|
||||
"max_level_beaten": 1
|
||||
},
|
||||
"write_file": {
|
||||
"max_level": 2,
|
||||
"max_level_beaten": 1
|
||||
}
|
||||
},
|
||||
"debug_code": {
|
||||
"debug_code_challenge_a": {
|
||||
"max_level": 2,
|
||||
"max_level_beaten": 1
|
||||
}
|
||||
},
|
||||
"information_retrieval": {
|
||||
"information_retrieval_challenge_a": {
|
||||
"max_level": 3,
|
||||
"max_level_beaten": null
|
||||
},
|
||||
"information_retrieval_challenge_b": {
|
||||
"max_level": 1,
|
||||
"max_level_beaten": null
|
||||
}
|
||||
},
|
||||
"kubernetes": {
|
||||
"kubernetes_template_challenge_a": {
|
||||
"max_level": 1,
|
||||
"max_level_beaten": null
|
||||
}
|
||||
},
|
||||
"memory": {
|
||||
"memory_challenge_a": {
|
||||
"max_level": 3,
|
||||
"max_level_beaten": 3
|
||||
},
|
||||
"memory_challenge_b": {
|
||||
"max_level": 5,
|
||||
"max_level_beaten": null
|
||||
},
|
||||
"memory_challenge_c": {
|
||||
"max_level": 5,
|
||||
"max_level_beaten": null
|
||||
},
|
||||
"memory_challenge_d": {
|
||||
"max_level": 5,
|
||||
"max_level_beaten": null
|
||||
}
|
||||
}
|
||||
}
|
||||
13
tests/challenges/debug_code/data/code.py
Normal file
13
tests/challenges/debug_code/data/code.py
Normal file
@@ -0,0 +1,13 @@
|
||||
# mypy: ignore-errors
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
||||
seen = {}
|
||||
for i, num in enumerate(nums):
|
||||
typo
|
||||
complement = target - num
|
||||
if complement in seen:
|
||||
return [seen[complement], i]
|
||||
seen[num] = i
|
||||
return None
|
||||
31
tests/challenges/debug_code/data/test.py
Normal file
31
tests/challenges/debug_code/data/test.py
Normal file
@@ -0,0 +1,31 @@
|
||||
# mypy: ignore-errors
# Challenge fixture: copied into the agent's workspace as the test harness
# the agent must make pass by fixing code.py.
from code import two_sum
from typing import List


def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Assert that two_sum(nums, target) returns expected_result."""
    result = two_sum(nums, target)
    print(result)
    assert (
        result == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # test the trivial case with the first two numbers
    nums = [2, 7, 11, 15]
    target = 9
    expected_result = [0, 1]
    test_two_sum(nums, target, expected_result)

    # test for ability to use zero and the same number twice
    nums = [2, 7, 0, 15, 12, 0]
    target = 0
    expected_result = [2, 5]
    test_two_sum(nums, target, expected_result)

    # test for first and last index usage and negative numbers
    nums = [-6, 7, 11, 4]
    target = -2
    expected_result = [0, 3]
    test_two_sum(nums, target, expected_result)
|
||||
56
tests/challenges/debug_code/test_debug_code_challenge_a.py
Normal file
56
tests/challenges/debug_code/test_debug_code_challenge_a.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.commands.execute_code import execute_python_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import (
|
||||
copy_file_into_workspace,
|
||||
get_workspace_path,
|
||||
run_interaction_loop,
|
||||
)
|
||||
|
||||
CYCLE_COUNT = 5
|
||||
EXPECTED_VALUES = ["[0, 1]", "[2, 5]", "[0, 3]"]
|
||||
DIRECTORY_PATH = Path(__file__).parent / "data"
|
||||
CODE_FILE_PATH = "code.py"
|
||||
TEST_FILE_PATH = "test.py"
|
||||
|
||||
|
||||
@challenge()
def test_debug_code_challenge_a(
    debug_code_agents: list[Agent],
    monkeypatch: pytest.MonkeyPatch,
    patched_api_requestor: MockerFixture,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """
    Test whether the agent can debug a simple code snippet.

    :param debug_code_agents: Agents (one per level); the one matching
        level_to_run is used.
    :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
    :param patched_api_requestor: Sends api requests to our API CI pipeline.
    :param level_to_run: The level to run.
    :param challenge_name: Name under which the challenge result is recorded.
    """
    debug_code_agent = debug_code_agents[level_to_run - 1]

    # Seed the workspace with the buggy code and its test harness.
    copy_file_into_workspace(debug_code_agent, DIRECTORY_PATH, CODE_FILE_PATH)
    copy_file_into_workspace(debug_code_agent, DIRECTORY_PATH, TEST_FILE_PATH)

    run_interaction_loop(
        monkeypatch, debug_code_agent, CYCLE_COUNT, challenge_name, level_to_run
    )

    # Run the shipped test file against the (hopefully fixed) code.
    output = execute_python_file(
        get_workspace_path(debug_code_agent, TEST_FILE_PATH), debug_code_agent
    )

    assert "error" not in output.lower(), f"Errors found in output: {output}!"

    for expected_value in EXPECTED_VALUES:
        assert (
            expected_value in output
        ), f"Expected output to contain {expected_value}, but it was not found in {output}!"
|
||||
@@ -0,0 +1,44 @@
|
||||
import pytest
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from autogpt.commands.file_operations import read_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import get_workspace_path, run_interaction_loop
|
||||
|
||||
CYCLE_COUNT = 3
|
||||
EXPECTED_REVENUES = [["81"], ["81"], ["81", "53", "24", "21", "11", "7", "4", "3", "2"]]
|
||||
from autogpt.agent import Agent
|
||||
|
||||
OUTPUT_LOCATION = "output.txt"
|
||||
|
||||
|
||||
@challenge()
def test_information_retrieval_challenge_a(
    information_retrieval_agents: list[Agent],
    monkeypatch: pytest.MonkeyPatch,
    patched_api_requestor: MockerFixture,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """
    Test the challenge_a function in a given agent by mocking user inputs and checking the output file content.

    :param information_retrieval_agents: Agents (one per level); the one
        matching level_to_run is used.
    :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
    :param patched_api_requestor: Sends api requests to our API CI pipeline.
    :param level_to_run: The level to run.
    :param challenge_name: Name under which the challenge result is recorded.
    """
    information_retrieval_agent = information_retrieval_agents[level_to_run - 1]
    run_interaction_loop(
        monkeypatch,
        information_retrieval_agent,
        CYCLE_COUNT,
        challenge_name,
        level_to_run,
    )

    file_path = get_workspace_path(information_retrieval_agent, OUTPUT_LOCATION)
    content = read_file(file_path, information_retrieval_agent)
    expected_revenues = EXPECTED_REVENUES[level_to_run - 1]
    for revenue in expected_revenues:
        # Accept the revenue followed by either punctuation, e.g. "81." or "81,"
        assert (
            f"{revenue}." in content or f"{revenue}," in content
        ), f"Expected the file to contain {revenue}"
|
||||
@@ -0,0 +1,50 @@
|
||||
import contextlib
|
||||
|
||||
import pytest
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.commands.file_operations import read_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import get_workspace_path, run_interaction_loop
|
||||
|
||||
CYCLE_COUNT = 3
|
||||
OUTPUT_LOCATION = "2010_nobel_prize_winners.txt"
|
||||
|
||||
|
||||
@challenge()
def test_information_retrieval_challenge_b(
    get_nobel_prize_agent: Agent,
    monkeypatch: pytest.MonkeyPatch,
    patched_api_requestor: MockerFixture,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """
    Test the challenge_b function in a given agent by mocking user inputs and checking the output file content.

    :param get_nobel_prize_agent: The agent to test.
    :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
    :param patched_api_requestor: APIRequestor Patch to override the openai.api_requestor module for testing.
    :param level_to_run: The level to run.
    :param challenge_name: Name under which the challenge result is recorded.
    """

    # The interaction loop may end via SystemExit; treat that as a normal end.
    with contextlib.suppress(SystemExit):
        run_interaction_loop(
            monkeypatch,
            get_nobel_prize_agent,
            CYCLE_COUNT,
            challenge_name,
            level_to_run,
        )
    file_path = get_workspace_path(get_nobel_prize_agent, OUTPUT_LOCATION)

    content = read_file(file_path, get_nobel_prize_agent)
    assert "Andre Geim" in content, "Expected the file to contain Andre Geim"
    assert (
        "Konstantin Novoselov" in content
    ), "Expected the file to contain Konstantin Novoselov"
    assert (
        "University of Manchester" in content
    ), "Expected the file to contain University of Manchester"
    assert "graphene" in content, "Expected the file to contain graphene"
|
||||
@@ -0,0 +1,43 @@
|
||||
import pytest
|
||||
import yaml
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.commands.file_operations import read_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import get_workspace_path, run_interaction_loop
|
||||
|
||||
CYCLE_COUNT = 3
|
||||
OUTPUT_LOCATION = "kube.yaml"
|
||||
|
||||
|
||||
@challenge()
def test_kubernetes_template_challenge_a(
    kubernetes_agent: Agent,
    monkeypatch: pytest.MonkeyPatch,
    patched_api_requestor: MockerFixture,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """
    Test the challenge_a function in a given agent by mocking user inputs
    and checking the output file content.

    Args:
        kubernetes_agent (Agent)
        monkeypatch (pytest.MonkeyPatch)
        patched_api_requestor (MockerFixture)
        level_to_run (int)
        challenge_name (str)
    """
    run_interaction_loop(
        monkeypatch, kubernetes_agent, CYCLE_COUNT, challenge_name, level_to_run
    )

    file_path = get_workspace_path(kubernetes_agent, OUTPUT_LOCATION)
    content = read_file(file_path, kubernetes_agent)

    for word in ["apiVersion", "kind", "metadata", "spec"]:
        assert word in content, f"Expected the file to contain {word}"

    content = yaml.safe_load(content)
    # NOTE(review): safe_load parses a single YAML document, yet this asserts
    # that its "kind" value contains Service, Deployment AND Pod — a valid
    # manifest has exactly one kind, so this likely cannot pass on
    # well-formed output. Confirm the intended semantics.
    for word in ["Service", "Deployment", "Pod"]:
        assert word in content["kind"], f"Expected the file to contain {word}"
|
||||
0
tests/challenges/memory/__init__.py
Normal file
0
tests/challenges/memory/__init__.py
Normal file
84
tests/challenges/memory/test_memory_challenge_a.py
Normal file
84
tests/challenges/memory/test_memory_challenge_a.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import pytest
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.commands.file_operations import read_file, write_to_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import get_workspace_path, run_interaction_loop
|
||||
|
||||
OUTPUT_LOCATION = "output.txt"
|
||||
|
||||
|
||||
@challenge()
def test_memory_challenge_a(
    memory_management_agent: Agent,
    patched_api_requestor: MockerFixture,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """
    The agent reads a file containing a task_id. Then, it reads a series of other files.
    After reading 'n' files, the agent must write the task_id into a new file.
    Args:
        memory_management_agent (Agent)
        patched_api_requestor (MockerFixture)
        monkeypatch (pytest.MonkeyPatch)
        level_to_run (int)
        challenge_name (str)
    """
    task_id = "2314"
    # One instruction file per level; the chain ends with a write instruction.
    create_instructions_files(memory_management_agent, level_to_run, task_id)

    # Cycle budget: one per instruction file plus two extra cycles.
    run_interaction_loop(
        monkeypatch,
        memory_management_agent,
        level_to_run + 2,
        challenge_name,
        level_to_run,
    )

    file_path = get_workspace_path(memory_management_agent, OUTPUT_LOCATION)
    content = read_file(file_path, memory_management_agent)
    assert task_id in content, f"Expected the file to contain {task_id}"
|
||||
|
||||
|
||||
def create_instructions_files(
    memory_management_agent: Agent,
    num_files: int,
    task_id: str,
    base_filename: str = "instructions_",
) -> None:
    """
    Creates a series of instructions files for the memory challenge.
    Files are written directly into the agent's workspace as
    "<base_filename><i>.txt" for i in 1..num_files.
    Args:
        memory_management_agent (Agent)
        num_files (int): number of instruction files to create
        task_id (str): the id the agent must remember and write out
        base_filename (str, optional)
    """
    for i in range(1, num_files + 1):
        content = generate_content(i, task_id, base_filename, num_files)
        file_name = f"{base_filename}{i}.txt"
        file_path = get_workspace_path(memory_management_agent, file_name)
        write_to_file(file_path, content, memory_management_agent)
|
||||
|
||||
|
||||
def generate_content(
    index: int, task_id: str, base_filename: str, num_files: int
) -> str:
    """Build the text of instruction file *index* of *num_files*.

    The first file reveals the task_id and chains to the next file,
    intermediate files chain onward, and the last file instructs the agent
    to write the task_id to output.txt and shut down.
    """
    next_file = f"{base_filename}{index + 1}.txt"
    if index == 1:
        return f"This task_id is {task_id}\nRead the file {next_file}"
    if index != num_files:
        return f"Read the file {next_file}"
    return "Write the task_id into the file output.txt\nShutdown"
|
||||
91
tests/challenges/memory/test_memory_challenge_b.py
Normal file
91
tests/challenges/memory/test_memory_challenge_b.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import pytest
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.commands.file_operations import read_file, write_to_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import (
|
||||
generate_noise,
|
||||
get_workspace_path,
|
||||
run_interaction_loop,
|
||||
)
|
||||
|
||||
NOISE = 1000
|
||||
OUTPUT_LOCATION = "output.txt"
|
||||
|
||||
|
||||
@challenge()
def test_memory_challenge_b(
    memory_management_agent: Agent,
    patched_api_requestor: MockerFixture,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """
    The agent reads a series of files, each containing a task_id and noise. After reading 'n' files,
    the agent must write all the task_ids into a new file, filtering out the noise.

    Args:
        memory_management_agent (Agent)
        patched_api_requestor (MockerFixture)
        monkeypatch (pytest.MonkeyPatch)
        level_to_run (int)
        challenge_name (str)
    """
    # One task id per level: "1111", "2222", ...
    task_ids = [str(i * 1111) for i in range(1, level_to_run + 1)]
    create_instructions_files(memory_management_agent, level_to_run, task_ids)

    # Cycle budget: one per instruction file plus two extra cycles.
    run_interaction_loop(
        monkeypatch,
        memory_management_agent,
        level_to_run + 2,
        challenge_name,
        level_to_run,
    )

    file_path = get_workspace_path(memory_management_agent, OUTPUT_LOCATION)
    content = read_file(file_path, memory_management_agent)
    for task_id in task_ids:
        assert task_id in content, f"Expected the file to contain {task_id}"
|
||||
|
||||
|
||||
def create_instructions_files(
    memory_management_agent: Agent,
    level: int,
    task_ids: list,
    base_filename: str = "instructions_",
) -> None:
    """
    Creates a series of instructions files for the memory challenge.

    Args:
        memory_management_agent (Agent): agent whose workspace receives the files.
        level (int): number of instruction files to create.
        task_ids (list): one task_id per file, embedded alongside the noise.
        base_filename (str, optional): prefix for each instruction file name.
    """
    for i in range(1, level + 1):
        content = generate_content(i, task_ids, base_filename, level)
        file_name = f"{base_filename}{i}.txt"
        file_path = get_workspace_path(memory_management_agent, file_name)

        write_to_file(file_path, content, memory_management_agent)
|
||||
|
||||
|
||||
def generate_content(index: int, task_ids: list, base_filename: str, level: int) -> str:
    """
    Build the noisy instruction text for file *index* of memory challenge B.

    Args:
        index: int — 1-based position of the file in the chain.
        task_ids: list — ids; task_ids[index - 1] belongs to this file.
        base_filename: str — prefix shared by all instruction files.
        level: int — total number of files in the chain.

    Returns: str
    """
    task_id = task_ids[index - 1]
    noise = generate_noise(NOISE)
    if index != level:
        # NOTE(review): with index in 1..level this inner branch is
        # unreachable when level == 1 (index == level would hold); kept for
        # backward compatibility with out-of-range callers.
        if level == 1:
            return f"{noise}\nThe current task_id is {task_id}.\n{noise}\nWrite all the task_ids into the file output.txt. The file has not been created yet. After that, use the task_complete command."
        return f"{noise}\nThe current task_id is {task_id}.\n{noise}\nRead the file {base_filename}{index + 1}.txt using the read_file command."
    return f"{noise}\nThis task_id is {task_id}\n{noise}\nWrite all the task_ids into the file output.txt. The file has not been created yet. After that, use the task_complete command.\n"
|
||||
109
tests/challenges/memory/test_memory_challenge_c.py
Normal file
109
tests/challenges/memory/test_memory_challenge_c.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import pytest
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.commands.file_operations import read_file, write_to_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import (
|
||||
generate_noise,
|
||||
get_workspace_path,
|
||||
run_interaction_loop,
|
||||
)
|
||||
|
||||
# Number of random filler characters inserted around each phrase.
NOISE = 1200
# Workspace-relative file where the agent must write the collected phrases.
OUTPUT_LOCATION = "output.txt"
|
||||
|
||||
|
||||
@challenge()
def test_memory_challenge_c(
    memory_management_agent: Agent,
    patched_api_requestor: MockerFixture,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """
    Instead of reading task Ids from files as with the previous challenges, the agent now must remember
    phrases which may have semantically similar meaning and the agent must write the phrases to a file
    after seeing several of them.

    Args:
        memory_management_agent (Agent): agent under test, with its own workspace.
        patched_api_requestor (MockerFixture): fixture patching OpenAI API requests.
        monkeypatch (pytest.MonkeyPatch): used to mock user input for the interaction loop.
        level_to_run (int): challenge level, i.e. number of phrases to remember.
        challenge_name (str): name used when logging this challenge run.
    """
    silly_phrases = [
        "The purple elephant danced on a rainbow while eating a taco",
        "The sneaky toaster stole my socks and ran away to Hawaii",
        "My pet rock sings better than Beyoncé on Tuesdays",
        "The giant hamster rode a unicycle through the crowded mall",
        "The talking tree gave me a high-five and then flew away",
        "I have a collection of invisible hats that I wear on special occasions",
        "The flying spaghetti monster stole my sandwich and left a note saying 'thanks for the snack'",
        "My imaginary friend is a dragon who loves to play video games",
        "I once saw a cloud shaped like a giant chicken eating a pizza",
        "The ninja unicorn disguised itself as a potted plant and infiltrated the office",
    ]

    # One phrase per level.
    level_silly_phrases = silly_phrases[:level_to_run]
    create_instructions_files(
        memory_management_agent,
        level_to_run,
        level_silly_phrases,
    )

    # level_to_run + 2 cycles — presumably one per file plus slack for the
    # final write; TODO confirm against run_interaction_loop semantics.
    run_interaction_loop(
        monkeypatch,
        memory_management_agent,
        level_to_run + 2,
        challenge_name,
        level_to_run,
    )
    file_path = get_workspace_path(memory_management_agent, OUTPUT_LOCATION)
    content = read_file(file_path, agent=memory_management_agent)
    for phrase in level_silly_phrases:
        assert phrase in content, f"Expected the file to contain {phrase}"
|
||||
|
||||
|
||||
def create_instructions_files(
    memory_management_agent: Agent,
    level: int,
    task_ids: list,
    base_filename: str = "instructions_",
) -> None:
    """
    Creates a series of instructions files for the memory challenge.

    Args:
        memory_management_agent (Agent): agent whose workspace receives the files.
        level (int): number of instruction files to create.
        task_ids (list): the silly phrases, one per file (parameter name kept
            for backward compatibility with keyword callers).
        base_filename (str, optional): prefix for each instruction file name.
    """
    for i in range(1, level + 1):
        content = generate_content(i, task_ids, base_filename, level)
        file_name = f"{base_filename}{i}.txt"
        file_path = get_workspace_path(memory_management_agent, file_name)
        write_to_file(file_path, content, memory_management_agent)
|
||||
|
||||
|
||||
def generate_content(
    index: int, silly_phrases: list, base_filename: str, level: int
) -> str:
    """
    Build the noisy instruction text for file *index* of memory challenge C.

    Args:
        index: int — 1-based position of the file in the chain.
        silly_phrases: list — phrases; silly_phrases[index - 1] belongs to this file.
        base_filename: str — prefix shared by all instruction files.
        level: int — total number of files in the chain.

    Returns: str
    """
    phrase = silly_phrases[index - 1]
    noise = generate_noise(NOISE)
    if index != level:
        # NOTE(review): unreachable when level == 1 given index in 1..level;
        # kept for backward compatibility with out-of-range callers.
        if level == 1:
            return f"{noise}\nThe current phrase to remember is '{phrase}'.\n{noise}\nWrite all the phrases into the file output.txt. The file has not been created yet. After that, use the task_complete command."
        return f"{noise}\nThe current phrase is '{phrase}'.\n{noise}\nRead the file {base_filename}{index + 1}.txt using the read_file command."
    return f"{noise}\nThis phrase is '{phrase}'\n{noise}\nWrite all the phrases into the file output.txt. The file has not been created yet. After that, use the task_complete command.\n"
|
||||
241
tests/challenges/memory/test_memory_challenge_d.py
Normal file
241
tests/challenges/memory/test_memory_challenge_d.py
Normal file
@@ -0,0 +1,241 @@
|
||||
import json
|
||||
from typing import Dict
|
||||
|
||||
import pytest
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.commands.file_operations import read_file, write_to_file
|
||||
from tests.challenges.challenge_decorator.challenge_decorator import challenge
|
||||
from tests.challenges.utils import get_workspace_path, run_interaction_loop
|
||||
|
||||
# Highest level of this challenge currently expected to pass in CI.
LEVEL_CURRENTLY_BEATEN = 1
# Total number of defined levels (one per Sally-Anne event below).
MAX_LEVEL = 5
# Workspace-relative file where the agent must write the beliefs JSON.
OUTPUT_LOCATION = "output.txt"
|
||||
|
||||
|
||||
@challenge()
def test_memory_challenge_d(
    memory_management_agent: Agent,
    patched_api_requestor: MockerFixture,
    monkeypatch: pytest.MonkeyPatch,
    level_to_run: int,
    challenge_name: str,
) -> None:
    """
    The agent is given a series of events and must remember the respective beliefs of the characters.

    Args:
        memory_management_agent (Agent): agent under test, with its own workspace.
        patched_api_requestor (MockerFixture): fixture patching OpenAI API requests.
        monkeypatch (pytest.MonkeyPatch): used to mock user input for the interaction loop.
        level_to_run (int): challenge level, i.e. how many Sally-Anne events are used.
        challenge_name (str): name used when logging this challenge run.
    """
    # Sally-Anne-style false-belief scenarios, one event per level.
    sally_anne_test_phrases = [
        "Sally has a marble (marble A) and she puts it in her basket (basket S), then leaves the room. Anne moves marble A from Sally's basket (basket S) to her own basket (basket A).",
        "Sally gives a new marble (marble B) to Bob who is outside with her. Bob goes into the room and places marble B into Anne's basket (basket A). Anne tells Bob to tell Sally that he lost the marble b. Bob leaves the room and speaks to Sally about the marble B. Meanwhile, after Bob left the room, Anne moves marble A into the green box, but tells Charlie to tell Sally that marble A is under the sofa. Charlie leaves the room and speaks to Sally about the marble A as instructed by Anne.",
        "Sally gives a new marble (marble C) to Charlie who is outside with her. Charlie enters the room and exchanges marble C with marble B in Anne's basket (basket A). Anne tells Charlie to tell Sally that he put marble C into the red box. Charlie leaves the room and speak to Sally about marble C as instructed by Anne. Meanwhile, after Charlie leaves the room, Bob enters into the room and moves marble A from the green box to under the sofa, but tells Anne to tell Sally that marble A is in the green box. Anne leaves the room and speak to Sally about the marble A as instructed by Bob",
        "Sally gives a new marble (marble D) to Anne. Anne gives the marble to Charlie. Charlie enters the room and gives marble D to Bob. Bob tells Charlie to tell Sally that he put marble D under the sofa. Bob put marble D under the sofa Charlie leaves the room and speaks to Sally about marble D. Meanwhile, after Charlie leaves the room, Bob takes marble A from under the sofa and places it in the blue box.",
        "Sally gives a new marble (marble E) to Charlie who is outside with her. Charlie enters the room and places marble E in the red box. Anne, who is already in the room, takes marble E from the red box, and hides it under the sofa. Then Anne leaves the room and tells Sally that marble E is in the green box. Meanwhile, after Anne leaves the room, Charlie who re-enters the room takes marble D from under the sofa and places it in his own basket (basket C).",
    ]
    level_sally_anne_test_phrases = sally_anne_test_phrases[:level_to_run]
    create_instructions_files(
        memory_management_agent, level_to_run, level_sally_anne_test_phrases
    )
    # level_to_run + 2 cycles — presumably one per event file plus slack for
    # the final write; TODO confirm.
    run_interaction_loop(
        monkeypatch,
        memory_management_agent,
        level_to_run + 2,
        challenge_name,
        level_to_run,
    )
    file_path = get_workspace_path(memory_management_agent, OUTPUT_LOCATION)

    content = read_file(file_path, memory_management_agent)
    check_beliefs(content, level_to_run)
|
||||
|
||||
|
||||
def check_beliefs(content: str, level: int) -> None:
    """Assert that the beliefs in the agent's JSON output match expectations.

    Args:
        content (str): JSON text produced by the agent (parsed by extract_beliefs).
        level (int): challenge level; selects the expected-beliefs table below.

    Raises:
        AssertionError: if any expected marble location is not contained in
            the agent's reported belief string.
    """
    # Define the expected beliefs for each level
    expected_beliefs = {
        1: {
            "Sally": {
                "marble A": "basket S",
            },
            "Anne": {
                "marble A": "basket A",
            },
        },
        2: {
            "Sally": {
                "marble A": "sofa",  # Because Charlie told her
                "marble B": "lost",  # Because Bob told her
            },
            "Anne": {
                "marble A": "green box",  # Because she moved it there
                "marble B": "basket A",  # Because Bob put it there and she was in the room
            },
            "Bob": {
                "marble B": "basket A",  # Last place he put it
            },
            "Charlie": {
                "marble A": "sofa",  # Because Anne told him to tell Sally so
            },
        },
        3: {
            "Sally": {
                "marble A": "green box",  # Because Anne told her
                "marble C": "red box",  # Because Charlie told her
            },
            "Anne": {
                "marble A": "sofa",  # Because Bob moved it there and told her
                "marble B": "basket A",  # Because Charlie exchanged marble C with marble B in her basket
                "marble C": "basket A",  # Because Charlie exchanged marble C with marble B in her basket
            },
            "Bob": {
                "marble A": "sofa",  # Because he moved it there
                "marble B": "basket A",
                # Because Charlie exchanged marble C with marble B in Anne's basket, and he was in the room
                "marble C": "basket A",
                # Because Charlie exchanged marble C with marble B in Anne's basket, and he was in the room
            },
            "Charlie": {
                "marble A": "sofa",  # Last place he knew it was
                "marble B": "basket A",  # Because he exchanged marble C with marble B in Anne's basket
                "marble C": "red box",  # Because Anne told him to tell Sally so
            },
        },
        4: {
            "Sally": {
                "marble A": "green box",  # Because Anne told her in the last conversation
                "marble C": "red box",  # Because Charlie told her
                "marble D": "sofa",  # Because Charlie told her
            },
            "Anne": {
                "marble A": "blue box",  # Because Bob moved it there, and she was not in the room to see
                "marble B": "basket A",  # Last place she knew it was
                "marble C": "basket A",  # Last place she knew it was
                "marble D": "sofa",  # Because Bob moved it there, and she was in the room to see
            },
            "Bob": {
                "marble A": "blue box",  # Because he moved it there
                "marble B": "basket A",  # Last place he knew it was
                "marble C": "basket A",  # Last place he knew it was
                "marble D": "sofa",  # Because he moved it there
            },
            "Charlie": {
                "marble A": "sofa",  # Last place he knew it was
                "marble B": "basket A",  # Last place he knew it was
                "marble C": "red box",  # Last place he knew it was
                "marble D": "sofa",  # Because Bob told him to tell Sally so
            },
        },
        5: {
            "Sally": {
                "marble A": "green box",  # Because Anne told her in the last level
                "marble C": "red box",  # Because Charlie told her
                "marble D": "sofa",  # Because Charlie told her
                "marble E": "green box",  # Because Anne told her
            },
            "Anne": {
                "marble A": "blue box",  # Last place she knew it was
                "marble B": "basket A",  # Last place she knew it was
                "marble C": "basket A",  # Last place she knew it was
                "marble D": "basket C",  # Last place she knew it was
                "marble E": "sofa",  # Because she moved it there
            },
            "Charlie": {
                "marble A": "blue box",  # Last place he knew it was
                "marble B": "basket A",  # Last place he knew it was
                "marble C": "basket A",  # Last place he knew it was
                "marble D": "basket C",  # Because he moved it there
                "marble E": "red box",  # Last place he knew it was
            },
            "Bob": {
                "marble A": "blue box",  # Last place he knew it was
                "marble C": "red box",  # Last place he knew it was
                "marble D": "sofa",  # Last place he knew it was
            },
        },
    }

    # Extract the beliefs from the AI's response
    ai_beliefs = extract_beliefs(content)
    # Check the AI's beliefs against the expected beliefs
    for character, belief in expected_beliefs[level].items():
        for marble, location in belief.items():
            # Substring match: the agent may phrase the location more verbosely.
            ai_belief = ai_beliefs.get(character, {}).get(marble, "")
            assert (
                location in ai_belief
            ), f"For {character}'s {marble}, expected '{location}' to be in '{ai_belief}'"
|
||||
|
||||
|
||||
def extract_beliefs(content: str) -> Dict[str, Dict[str, str]]:
    """Extract the beliefs of each character from the AI's JSON output.

    Args:
        content (str): JSON document expected to hold a top-level "beliefs" key.

    Returns:
        Dict[str, Dict[str, str]]: character -> marble -> believed location;
        empty dict when the "beliefs" key is absent.
    """
    parsed = json.loads(content)
    return parsed.get("beliefs", {})
|
||||
|
||||
|
||||
def create_instructions_files(
    memory_management_agent: Agent,
    level: int,
    test_phrases: list,
    base_filename: str = "instructions_",
) -> None:
    """
    Creates a series of instructions files for the memory challenge.

    Args:
        memory_management_agent (Agent): agent whose workspace receives the files.
        level (int): number of instruction files to create.
        test_phrases (list): Sally-Anne event descriptions, one per file.
        base_filename (str, optional): prefix for each instruction file name.
    """
    for i in range(1, level + 1):
        content = generate_content(i, test_phrases, base_filename, level)
        file_name = f"{base_filename}{i}.txt"
        file_path = get_workspace_path(memory_management_agent, file_name)
        write_to_file(file_path, content, memory_management_agent)
|
||||
|
||||
|
||||
def generate_content(
    index: int, test_phrases: list, base_filename: str, level: int
) -> str:
    """
    Build the instruction text for file *index* of memory challenge D.

    Args:
        index: int — 1-based position of the file in the chain.
        test_phrases: list — event descriptions; test_phrases[index - 1] is used.
        base_filename: str — prefix shared by all instruction files.
        level: int — total number of files in the chain.
    Returns: str
    """
    phrase = test_phrases[index - 1]

    # Intermediate files relay the event and chain to the next file; the last
    # file additionally carries the rules and the JSON output instructions.
    if index != level:
        return f"The current event is '{phrase}'.\nRead the file {base_filename}{index + 1}.txt using the read_file command."
    else:
        return f"""
        This event is '{phrase}'
        rules:
        1. Everyone inside the room see and know what are the actions of the people inside the room.
        2. Everyone outside the room do not see and do not know the actions of the people inside the room.
        3. Only write about the marbles that are present in the level
        4. Marbles names are marble A, marble B, marble C, marble D, ...
        5. Believed_location_of_the_specific_marble describes where is the marble like drawer, basket S, sofa, ...
        6. Do not use expression like <Anne's basket> use <basket A> instead.
        7. All characters believe what they're being told by other characters.
        Instructions:
        I) Write the following information in the file output.txt in JSON format:
        1. The respective beliefs of the characters (which means where every marble is according to character x, y, z. Character x should say where it believes every marble it is aware exist is)
        The format should be as follows:
        {{
        "beliefs": {{
        "<character_name>": {{
        "<marble_name>": "<believed_location_of_the_specific_marble>",
        ...
        }},
        ...
        }},
        }}
        Example of output (only use this to understand and learn how to use the format. Nothing else):
        {{"beliefs": {{"Sally": {{"marble A": "basket A"}}, "Bob": {{"marble B": "basket S"}}, "Anne": {{"marble A": "green box"}}, "Charlie": {{"marble B": "sofa"}}}}
        II) The file output.txt has not been created yet. You need to create it. After that, use the task_complete command.
        """
|
||||
@@ -0,0 +1,59 @@
|
||||
import importlib.util
|
||||
import inspect
|
||||
import os
|
||||
from types import ModuleType
|
||||
from typing import List
|
||||
|
||||
# Path to the challenges folder
# (resolved relative to this file so the check works from any working directory).
CHALLENGES_DIR = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "../challenges"
)
|
||||
|
||||
|
||||
def get_python_files(directory: str, exclude_file: str) -> List[str]:
    """Recursively collect challenge test files under *directory*.

    Returns every ``test_*.py`` file in the directory tree, except the one
    named *exclude_file*, joined onto its containing directory.
    """
    return [
        os.path.join(root, file_name)
        for root, _dirs, file_names in os.walk(directory)
        for file_name in file_names
        if file_name.endswith(".py")
        and file_name.startswith("test_")
        and file_name != exclude_file
    ]
|
||||
|
||||
|
||||
def load_module_from_file(test_file: str) -> ModuleType:
    """Import *test_file* as an anonymous module and return it.

    Raises:
        AssertionError: if no import spec or loader can be built for the file.
    """
    spec = importlib.util.spec_from_file_location("module.name", test_file)
    assert spec is not None, f"Unable to get spec for module in file {test_file}"
    assert (
        spec.loader is not None
    ), f"Unable to get loader for module in file {test_file}"
    loaded_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded_module)
    return loaded_module
|
||||
|
||||
|
||||
def get_test_functions(module: ModuleType) -> List:
    """Return (name, function) pairs for every ``test_*`` function in *module*."""
    members = inspect.getmembers(module)
    return [
        member
        for member in members
        if inspect.isfunction(member[1]) and member[0].startswith("test_")
    ]
|
||||
|
||||
|
||||
def assert_single_test_function(functions_list: List, test_file: str) -> None:
    """Assert *test_file* defines exactly one test whose name matches the file.

    The single function's name (minus the ``test_`` prefix) must equal the
    file's base name minus ``test_`` and the ``.py`` suffix.
    """
    assert len(functions_list) == 1, f"{test_file} should contain only one function"
    function_name = functions_list[0][0]
    expected_suffix = os.path.basename(test_file)[5:-3]
    assert (
        function_name[5:] == expected_suffix
    ), f"The function in {test_file} should have the same name as the file without 'test_' prefix"
|
||||
|
||||
|
||||
def test_method_name_and_count() -> None:
    """Every challenge file must hold exactly one test named after the file."""
    this_file: str = os.path.basename(__file__)
    for candidate in get_python_files(CHALLENGES_DIR, this_file):
        functions = get_test_functions(load_module_from_file(candidate))
        assert_single_test_function(functions, candidate)
|
||||
76
tests/challenges/utils.py
Normal file
76
tests/challenges/utils.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import contextlib
|
||||
import random
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any, Generator
|
||||
|
||||
import pytest
|
||||
|
||||
from autogpt.agent import Agent
|
||||
from autogpt.log_cycle.log_cycle import LogCycleHandler
|
||||
|
||||
|
||||
def generate_noise(noise_size: int) -> str:
    """Return *noise_size* pseudo-random alphanumeric filler characters.

    The global RNG is reseeded with a fixed value on every call, so repeated
    calls yield identical noise — this keeps challenge fixtures reproducible.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    random.seed(42)
    picks = random.choices(alphabet, k=noise_size)
    return "".join(picks)
|
||||
|
||||
|
||||
def setup_mock_input(monkeypatch: pytest.MonkeyPatch, cycle_count: int) -> None:
    """
    Sets up the mock input for testing.

    :param monkeypatch: pytest's monkeypatch utility for modifying builtins.
    :param cycle_count: The number of cycles to mock.
    """
    # cycle_count confirmations followed by an explicit exit.
    answers = iter(["y"] * cycle_count + ["EXIT"])
    monkeypatch.setattr("autogpt.utils.session.prompt", lambda _: next(answers))
|
||||
|
||||
|
||||
def run_interaction_loop(
    monkeypatch: pytest.MonkeyPatch,
    agent: Agent,
    cycle_count: int,
    challenge_name: str,
    level_to_run: int,
) -> None:
    """Drive *agent*'s interaction loop with mocked input until it exits.

    Mocks ``cycle_count`` user confirmations, patches log-cycle naming for the
    given challenge/level, and swallows the SystemExit the loop raises on exit.
    """
    setup_mock_input(monkeypatch, cycle_count)

    setup_mock_log_cycle_agent_name(monkeypatch, challenge_name, level_to_run)
    # The loop terminates via SystemExit after the mocked "EXIT" input.
    with contextlib.suppress(SystemExit):
        agent.start_interaction_loop()
|
||||
|
||||
|
||||
def setup_mock_log_cycle_agent_name(
    monkeypatch: pytest.MonkeyPatch, challenge_name: str, level_to_run: int
) -> None:
    """Patch log-cycle naming so logs are grouped per challenge and level."""
    short_name = f"{challenge_name}_level_{level_to_run}"
    monkeypatch.setattr(
        LogCycleHandler, "get_agent_short_name", lambda *args, **kwargs: short_name
    )
|
||||
|
||||
|
||||
def get_workspace_path(agent: Agent, file_name: str) -> str:
    """Return the path of *file_name* inside *agent*'s workspace, as a string."""
    workspace_path = agent.workspace.get_path(file_name)
    return str(workspace_path)
|
||||
|
||||
|
||||
def copy_file_into_workspace(
    agent: Agent, directory_path: Path, file_path: str
) -> None:
    """Copy ``directory_path / file_path`` into *agent*'s workspace under the same name."""
    source = directory_path / file_path
    destination = get_workspace_path(agent, file_path)
    shutil.copy(source, destination)
|
||||
44
tests/challenges/utils/build_current_score.py
Normal file
44
tests/challenges/utils/build_current_score.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
|
||||
def deep_merge(source: Dict[Any, Any], dest: Dict[Any, Any]) -> Dict[Any, Any]:
|
||||
for key, value in source.items():
|
||||
if isinstance(value, Dict):
|
||||
dest[key] = deep_merge(value, dest.get(key, {}))
|
||||
else:
|
||||
dest[key] = value
|
||||
return dest
|
||||
|
||||
|
||||
import collections
|
||||
|
||||
|
||||
def recursive_sort_dict(data: dict) -> dict:
    """Return *data* as an OrderedDict with keys sorted at every nesting level."""
    for key in list(data.keys()):
        value = data[key]
        if isinstance(value, dict):
            data[key] = recursive_sort_dict(value)
    return collections.OrderedDict(sorted(data.items()))
|
||||
|
||||
# setup


cwd = os.getcwd()  # get current working directory
# Per-run score fragments written by the challenge runs.
new_score_filename_pattern = os.path.join(cwd, "tests/challenges/new_score_*.json")
# Aggregated, committed scoreboard file.
current_score_filename = os.path.join(cwd, "tests/challenges/current_score.json")

# Merge every new_score_*.json fragment into one dict, consuming the fragments.
merged_data: Dict[str, Any] = {}
for filename in glob.glob(new_score_filename_pattern):
    with open(filename, "r") as f_new:
        data = json.load(f_new)
        merged_data = deep_merge(
            data, merged_data
        )  # deep merge the new data with the merged data
    os.remove(filename)  # remove the individual file
# Sort keys recursively so the committed JSON stays diff-stable.
sorted_data = recursive_sort_dict(merged_data)

with open(current_score_filename, "w") as f_current:
    json_data = json.dumps(sorted_data, indent=4)
    f_current.write(json_data + "\n")
|
||||
Reference in New Issue
Block a user