Fix "code.py" conflict with Python's code module, and fix TestReturnCode_Simple conflict between two test.py files. (#321)

Co-authored-by: Luke <2609441+lc0rp@user.noreply.github.com>
Co-authored-by: merwanehamadi <merwanehamadi@gmail.com>
2026-01-10 07:38:04 -05:00 · 2023-08-19 09:04:18 -07:00
parent ebec2ac813
commit 9f1631719c
50 changed files with 60 additions and 31 deletions
--- a/agbenchmark/challenges/SUITES.md
+++ b/agbenchmark/challenges/SUITES.md
@@ -87,7 +87,7 @@ The structure for a non same_task report looks like this:
                        "code",
                        "iterate"
                    ],
-                    "task": "Return the multiplied number in the function multiply_int in code.py. You can make sure you have correctly done this by running test.py",
+                    "task": "Return the multiplied number in the function multiply_int in sample_code.py. You can make sure you have correctly done this by running test.py",
                    "answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
                    "description": "Simple test if a simple code instruction can be executed",
                    "metrics": {
@@ -106,7 +106,7 @@ The structure for a non same_task report looks like this:
                        "code",
                        "iterate"
                    ],
-                    "task": "Add a function called multiply_int in code.py that multiplies numbers by 2. You can make sure you have correctly done this by running test.py",
+                    "task": "Add a function called multiply_int in sample_code.py that multiplies numbers by 2. You can make sure you have correctly done this by running test.py",
                    "answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
                    "description": "Small step up, just writing the function with a name as well as the return statement.",
                    "metrics": {
--- a/agbenchmark/challenges/adapatability/a1_debug/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/adapatability/a1_debug/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/adapatability/a1_debug/artifacts_in/test.py
+++ b/agbenchmark/challenges/adapatability/a1_debug/artifacts_in/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import two_sum
 from typing import List

+from sample_code import two_sum
+

 def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
--- a/agbenchmark/challenges/adapatability/a1_debug/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/adapatability/a1_debug/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/adapatability/a1_debug/artifacts_out/test.py
+++ b/agbenchmark/challenges/adapatability/a1_debug/artifacts_out/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import two_sum
 from typing import List

+from sample_code import two_sum
+

 def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
--- a/agbenchmark/challenges/adapatability/a1_debug/data.json
+++ b/agbenchmark/challenges/adapatability/a1_debug/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestAdaptSimpleTypoWithGuidance",
  "category": ["adaptability"],
-  "task": "1- Run code.py.\n2- Read test.py.\n3- Modify test.py.\nRepeat step 1, 2 and 3 until code.py runs without errors.\n",
+  "task": "1- Run sample_code.py.\n2- Read test.py.\n3- Modify test.py.\nRepeat step 1, 2 and 3 until sample_code.py runs without errors.\n",
  "dependencies": ["TestDebugSimpleTypoWithGuidance"],
  "cutoff": 75,
  "ground": {
--- a/agbenchmark/challenges/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/1_return/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/code/c1_writing_suite_1/1_return/artifacts_in/test.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/1_return/artifacts_in/test.py
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from code import multiply_int
+from sample_code import multiply_int


 def test_multiply_int(num: int, expected_result: int) -> None:
--- a/agbenchmark/challenges/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/1_return/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c1_writing_suite_1/1_return/artifacts_out/test.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/1_return/artifacts_out/test.py
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from code import multiply_int
+from sample_code import multiply_int


 def test_multiply_int(num: int, expected_result: int) -> None:
--- a/agbenchmark/challenges/code/c1_writing_suite_1/1_return/data.json
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/1_return/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestReturnCode_Simple",
  "category": ["code", "iterate"],
-  "task": "Return the multiplied number in the function multiply_int in code.py. You can make sure you have correctly done this by running test.py",
+  "task": "Return the multiplied number in the function multiply_int in sample_code.py. You can make sure you have correctly done this by running test.py",
  "dependencies": ["TestReadFile"],
  "cutoff": 120,
  "ground": {
--- a/agbenchmark/challenges/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/2_write/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/code/c1_writing_suite_1/2_write/artifacts_in/test.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/2_write/artifacts_in/test.py
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from code import multiply_int
+from sample_code import multiply_int


 def test_multiply_int(num: int, expected_result: int) -> None:
--- a/agbenchmark/challenges/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/2_write/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c1_writing_suite_1/2_write/artifacts_out/test.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/2_write/artifacts_out/test.py
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from code import multiply_int
+from sample_code import multiply_int


 def test_multiply_int(num: int, expected_result: int) -> None:
--- a/agbenchmark/challenges/code/c1_writing_suite_1/2_write/data.json
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/2_write/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestReturnCode_Write",
  "category": ["code", "iterate"],
-  "task": "Add a function called multiply_int in code.py that multiplies numbers by 2. You can make sure you have correctly done this by running test.py",
+  "task": "Add a function called multiply_int in sample_code.py that multiplies numbers by 2. You can make sure you have correctly done this by running test.py",
  "dependencies": ["TestReturnCode_Simple"],
  "cutoff": 120,
  "ground": {
--- a/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/artifacts_in/test.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/artifacts_in/test.py
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from code import multiply_int
+from sample_code import multiply_int


 def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
--- a/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/artifacts_out/test.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/artifacts_out/test.py
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from code import multiply_int
+from sample_code import multiply_int


 def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
--- a/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/data.json
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/3_modify/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestReturnCode_Modify",
  "category": ["code", "iterate"],
-  "task": "Modify the multiply_int function in code.py to be able to pass in a 'multiplier' argument to multiply the 'num' by 'multiplier'. Both arguments are integers. You can make sure you have correctly done this by running test.py",
+  "task": "Modify the multiply_int function in sample_code.py to be able to pass in a 'multiplier' argument to multiply the 'num' by 'multiplier'. Both arguments are integers. You can make sure you have correctly done this by running test.py",
  "dependencies": ["TestReturnCode_Write"],
  "cutoff": 120,
  "ground": {
--- a/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/artifacts_in/testfile.py
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from code import multiply_int
+from sample_code import multiply_int


 def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
--- a/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/artifacts_out/testfile.py
@@ -0,0 +1,18 @@
+# mypy: ignore-errors
+from sample_code import multiply_int
+
+
+def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
+    result = multiply_int(num, multiplier)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case
+    num = 4
+    multiplier = 2
+    expected_result = 8
+    test_multiply_int(num, multiplier, expected_result)
--- a/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/custom_python/test.py
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/custom_python/test.py
@@ -1,5 +1,5 @@
 # mypy: ignore-errors
-from code import multiply_int
+from sample_code import multiply_int


 def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
--- a/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/data.json
+++ b/agbenchmark/challenges/code/c1_writing_suite_1/4_tests/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestReturnCode_Tests",
  "category": ["code", "iterate"],
-  "task": "First, modify test.py to fill in the test case to be able to test the code in code.py. Next, modify the multiply_int function in code.py to be able to pass in a 'multiplier' argument to multiply the 'num' by 'multiplier'. Both arguments are integers. You can make sure you have correctly done this by running test.py that you previously modified.",
+  "task": "First, modify testfile.py to fill in the test case to be able to test the code in sample_code.py. Next, modify the multiply_int function in sample_code.py to be able to pass in a 'multiplier' argument to multiply the 'num' by 'multiplier'. Both arguments are integers. You can make sure you have correctly done this by running testfile.py that you previously modified.",
  "dependencies": ["TestReturnCode_Modify"],
  "cutoff": 120,
  "ground": {
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/artifacts_in/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import two_sum
 from typing import List

+from sample_code import two_sum
+

 def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/artifacts_out/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import two_sum
 from typing import List

+from sample_code import two_sum
+

 def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/data.json
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.1_guided/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestDebugSimpleTypoWithGuidance",
  "category": ["code", "iterate"],
-  "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+  "task": "1- Run test.py.\n2- Read sample_code.py.\n3- Modify sample_code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
  "dependencies": ["TestReadFile"],
  "cutoff": 75,
  "ground": {
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.2_vague/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.2_vague/artifacts_in/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import two_sum
 from typing import List

+from sample_code import two_sum
+

 def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.2_vague/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.2_vague/artifacts_out/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import two_sum
 from typing import List

+from sample_code import two_sum
+

 def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.3_import/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.3_import/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c2_debug_suite/d2.3_import/artifacts_out/test.py
+++ b/agbenchmark/challenges/code/c2_debug_suite/d2.3_import/artifacts_out/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import two_sum
 from typing import List

+from sample_code import two_sum
+

 def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
--- a/agbenchmark/challenges/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c3_writing_suite_2/d3.1_three_sum/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py
+++ b/agbenchmark/challenges/code/c3_writing_suite_2/d3.1_three_sum/custom_python/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import three_sum
 from typing import List

+from sample_code import three_sum
+

 def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None:
    result = three_sum(nums, target)
--- a/agbenchmark/challenges/code/c3_writing_suite_2/d3.1_three_sum/data.json
+++ b/agbenchmark/challenges/code/c3_writing_suite_2/d3.1_three_sum/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestThreeSum",
  "category": ["code", "iterate"],
-  "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].",
+  "task": "Create a three_sum function in a file called sample_code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].",
  "dependencies": ["TestFunctionCodeGeneration"],
  "cutoff": 60,
  "ground": {
--- a/agbenchmark/challenges/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/code/c3_writing_suite_2/d3_two_sum/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py
+++ b/agbenchmark/challenges/code/c3_writing_suite_2/d3_two_sum/custom_python/test.py
@@ -1,7 +1,8 @@
 # mypy: ignore-errors
-from code import two_sum
 from typing import List

+from sample_code import two_sum
+

 def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    result = two_sum(nums, target)
--- a/agbenchmark/challenges/code/c3_writing_suite_2/d3_two_sum/data.json
+++ b/agbenchmark/challenges/code/c3_writing_suite_2/d3_two_sum/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestFunctionCodeGeneration",
  "category": ["code"],
-  "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
+  "task": "Create a two_sum function in a file called sample_code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
  "dependencies": ["TestReturnCode_Write"],
  "cutoff": 90,
  "ground": {
--- a/agbenchmark/challenges/code/c4_writing_cli_suite_3/1_password_generator/data.json
+++ b/agbenchmark/challenges/code/c4_writing_cli_suite_3/1_password_generator/data.json
@@ -1,7 +1,7 @@
 {
  "name": "TestPasswordGenerator_Easy",
  "category": ["code"],
-  "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain letters, numbers and symbols. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py",
+  "task": "Create a random password generator. The password should have between 8 and 16 characters and should contain letters, numbers and symbols. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--len x] where x is the length of the password. If no length is specified, the password should be 8 characters long. The password_generator can also be imported as a module and called as password = password_generator.generate_password(len=x).",
  "dependencies": ["TestWriteFile"],
  "cutoff": 90,
  "ground": {
@@ -15,7 +15,7 @@
  },
  "info": {
    "difficulty": "basic",
-    "description": "Tests ability for the agent to code a file organizer.",
+    "description": "Tests ability for the agent to create a random password generator.",
    "side_effects": []
  }
 }
--- a/agbenchmark/challenges/ethereum/a1_price/artifacts_in/sample_code.py
+++ b/agbenchmark/challenges/ethereum/a1_price/artifacts_in/sample_code.py
--- a/agbenchmark/challenges/ethereum/a1_price/artifacts_in/test.py
+++ b/agbenchmark/challenges/ethereum/a1_price/artifacts_in/test.py
@@ -1,5 +1,6 @@
 import re
-from code import get_ethereum_price
+
+from sample_code import get_ethereum_price


 def test_get_ethereum_price() -> None:
--- a/agbenchmark/challenges/ethereum/a1_price/artifacts_out/sample_code.py
+++ b/agbenchmark/challenges/ethereum/a1_price/artifacts_out/sample_code.py
--- a/agbenchmark/challenges/ethereum/a1_price/artifacts_out/test.py
+++ b/agbenchmark/challenges/ethereum/a1_price/artifacts_out/test.py
@@ -1,5 +1,6 @@
 import re
-from code import get_ethereum_price
+
+from sample_code import get_ethereum_price


 def test_get_ethereum_price() -> None: