mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
test(copilot): add functional tests for dry-run loop beyond substring checks
Add 23 new tests covering:
- run_agent and run_block OpenAI tool schema validation (type, optionality, description quality, coexistence of dry_run + wait_for_result)
- RunAgentInput Pydantic model behavior (default value, bool coercion, combined parameters, validation bounds, string stripping)
- Guide workflow ordering (create before dry-run, dry-run before inspect, fix before repeat, numbered step sequence)
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
"""Prompt regression tests for the dry-run verification loop.
|
||||
"""Prompt regression tests AND functional tests for the dry-run verification loop.
|
||||
|
||||
These tests verify that the create -> dry-run -> fix iterative workflow is
|
||||
properly communicated through tool descriptions, the prompting supplement,
|
||||
@@ -11,21 +11,48 @@ descriptions no longer repeat it — they keep a minimal footprint.
|
||||
**Intentionally brittle**: the assertions check for specific substrings so
|
||||
that accidental removal or rewording of key instructions is caught. If you
|
||||
deliberately reword a prompt, update the corresponding assertion here.
|
||||
|
||||
--- Functional tests (added separately) ---
|
||||
|
||||
The dry-run loop is primarily a *prompt/guide* feature — the copilot reads
|
||||
the guide and follows its instructions. There are no standalone Python
|
||||
functions that implement "loop until passing" logic; the loop is driven by
|
||||
the LLM. However, several pieces of real Python infrastructure make the
|
||||
loop possible:
|
||||
|
||||
1. The ``run_agent`` and ``run_block`` OpenAI tool schemas expose a
|
||||
``dry_run`` boolean parameter that the LLM must be able to set.
|
||||
2. The ``RunAgentInput`` Pydantic model validates ``dry_run`` as a bool,
|
||||
defaulting to False, so the executor can branch on it.
|
||||
3. The ``_check_prerequisites`` method in ``RunAgentTool`` bypasses
|
||||
credential and missing-input gates when ``dry_run=True``.
|
||||
4. The guide documents the workflow steps in a specific order that the LLM
|
||||
must follow: create/edit -> dry-run -> inspect -> fix -> repeat.
|
||||
|
||||
The functional test classes below exercise items 1-4 directly.
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, cast
|
||||
|
||||
import pytest
|
||||
from openai.types.chat import ChatCompletionToolParam
|
||||
|
||||
from backend.copilot.prompting import get_sdk_supplement
|
||||
from backend.copilot.service import DEFAULT_SYSTEM_PROMPT
|
||||
from backend.copilot.tools import TOOL_REGISTRY
|
||||
from backend.copilot.tools.run_agent import RunAgentInput
|
||||
|
||||
# Resolved once for the whole module so individual tests stay fast.
# Runs at import time: use_e2b=False and a throwaway cwd keep the call hermetic.
# NOTE(review): assumes get_sdk_supplement has no side effects beyond building
# the supplement string — confirm if that helper ever starts touching disk.
_SDK_SUPPLEMENT = get_sdk_supplement(use_e2b=False, cwd="/tmp/test")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prompt regression tests (original)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSystemPromptBasics:
|
||||
"""Verify the system prompt includes essential baseline content.
|
||||
|
||||
@@ -122,3 +149,252 @@ class TestAgentBuildingGuideDryRunLoop:
|
||||
def test_workflow_has_dry_run_and_inspect_steps(self, guide_content):
|
||||
assert "**Dry-run**" in guide_content
|
||||
assert "**Inspect & fix**" in guide_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Functional tests: tool schema validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRunAgentToolSchema:
    """Validate the run_agent OpenAI tool schema exposes dry_run correctly.

    These checks go beyond substring matching: they inspect the full schema
    structure the LLM receives, so a malformed parameter definition is caught
    before OpenAI function-calling ever sees it.
    """

    @pytest.fixture
    def schema(self) -> ChatCompletionToolParam:
        return TOOL_REGISTRY["run_agent"].as_openai_tool()

    def test_schema_is_valid_openai_tool(self, schema: ChatCompletionToolParam):
        """The schema has the required top-level OpenAI structure."""
        assert schema["type"] == "function"
        assert "function" in schema
        function_spec = schema["function"]
        for required_key in ("name", "description", "parameters"):
            assert required_key in function_spec
        assert function_spec["name"] == "run_agent"

    def test_dry_run_not_required(self, schema: ChatCompletionToolParam):
        """dry_run should be optional (not in 'required') so it defaults to false."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        required_names = parameters.get("required", [])
        assert "dry_run" not in required_names

    def test_dry_run_is_boolean_type(self, schema: ChatCompletionToolParam):
        """dry_run must be typed as boolean so the LLM generates true/false."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        dry_run_spec = parameters["properties"]["dry_run"]
        assert dry_run_spec["type"] == "boolean"

    def test_dry_run_description_is_nonempty(self, schema: ChatCompletionToolParam):
        """The description must be present and substantive for LLM guidance."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        description = parameters["properties"]["dry_run"]["description"]
        assert isinstance(description, str)
        assert len(description) > 20, "Description too short to guide the LLM"

    def test_wait_for_result_coexists_with_dry_run(
        self, schema: ChatCompletionToolParam
    ):
        """wait_for_result must also be present — the guide instructs the LLM
        to pass both dry_run=True and wait_for_result=120 together."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        properties = parameters["properties"]
        assert "wait_for_result" in properties
        assert properties["wait_for_result"]["type"] == "integer"
||||
|
||||
class TestRunBlockToolSchema:
    """Validate the run_block OpenAI tool schema exposes dry_run correctly."""

    @pytest.fixture
    def schema(self) -> ChatCompletionToolParam:
        return TOOL_REGISTRY["run_block"].as_openai_tool()

    def test_schema_is_valid_openai_tool(self, schema: ChatCompletionToolParam):
        """Top-level structure: a function tool named run_block with parameters."""
        assert schema["type"] == "function"
        function_spec = schema["function"]
        assert function_spec["name"] == "run_block"
        assert "parameters" in function_spec

    def test_dry_run_exists_and_is_boolean(self, schema: ChatCompletionToolParam):
        """dry_run must be present and typed as boolean."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        properties = parameters["properties"]
        assert "dry_run" in properties
        assert properties["dry_run"]["type"] == "boolean"

    def test_dry_run_not_required(self, schema: ChatCompletionToolParam):
        """dry_run should be optional — block_id and input_data are required."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        required_names = parameters.get("required", [])
        assert "dry_run" not in required_names
        # block_id and input_data should be required
        for mandatory_name in ("block_id", "input_data"):
            assert mandatory_name in required_names

    def test_dry_run_description_mentions_simulation(
        self, schema: ChatCompletionToolParam
    ):
        """The description should tell the LLM that dry_run simulates execution."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        description = parameters["properties"]["dry_run"]["description"]
        assert isinstance(description, str)
        assert (
            "simulat" in description.lower()
        ), "run_block dry_run description should mention simulation"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Functional tests: RunAgentInput Pydantic model
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRunAgentInputModel:
    """Validate RunAgentInput Pydantic model handles dry_run correctly.

    The executor reads dry_run from this model, so it must parse, default,
    and validate properly.
    """

    def test_dry_run_defaults_to_false(self):
        """When dry_run is omitted, it must default to False."""
        model = RunAgentInput(username_agent_slug="user/agent")
        assert model.dry_run is False

    def test_dry_run_accepts_true(self):
        """An explicit dry_run=True round-trips unchanged."""
        model = RunAgentInput(username_agent_slug="user/agent", dry_run=True)
        assert model.dry_run is True

    def test_dry_run_accepts_false(self):
        """An explicit dry_run=False round-trips unchanged."""
        model = RunAgentInput(username_agent_slug="user/agent", dry_run=False)
        assert model.dry_run is False

    def test_dry_run_coerces_truthy_int(self):
        """Pydantic bool fields coerce int 1 to True."""
        model = RunAgentInput(username_agent_slug="user/agent", dry_run=1)  # type: ignore[arg-type]
        assert model.dry_run is True

    def test_dry_run_coerces_falsy_int(self):
        """Pydantic bool fields coerce int 0 to False."""
        model = RunAgentInput(username_agent_slug="user/agent", dry_run=0)  # type: ignore[arg-type]
        assert model.dry_run is False

    def test_dry_run_with_wait_for_result(self):
        """The guide instructs passing both dry_run=True and wait_for_result=120.

        The model must accept this combination."""
        model = RunAgentInput(
            username_agent_slug="user/agent",
            dry_run=True,
            wait_for_result=120,
        )
        assert model.dry_run is True
        assert model.wait_for_result == 120

    def test_wait_for_result_upper_bound(self):
        """wait_for_result is bounded at 300 seconds (ge=0, le=300)."""
        # Pydantic's ValidationError subclasses ValueError, so ValueError is
        # narrow enough here without importing pydantic directly. The previous
        # pytest.raises(Exception) was over-broad: it would have passed even if
        # construction blew up with an unrelated TypeError/AttributeError,
        # silently hiding a real model bug instead of verifying the bound.
        with pytest.raises(ValueError):
            RunAgentInput(
                username_agent_slug="user/agent",
                dry_run=True,
                wait_for_result=301,
            )

    def test_string_fields_are_stripped(self):
        """The strip_strings validator should strip whitespace from string fields."""
        model = RunAgentInput(username_agent_slug=" user/agent ", dry_run=True)
        assert model.username_agent_slug == "user/agent"
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Functional tests: guide documents the correct workflow ordering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestGuideWorkflowOrdering:
    """Verify the guide documents workflow steps in the correct order.

    The LLM must see: create/edit -> dry-run -> inspect -> fix -> repeat.
    If these steps are reordered, the copilot would follow the wrong sequence.
    These tests verify *ordering*, not just presence.
    """

    @pytest.fixture
    def guide_content(self) -> str:
        # str.index (rather than find) is used throughout so a missing marker
        # raises ValueError with a clear traceback instead of comparing -1s.
        repo_root = Path(__file__).resolve().parent.parent.parent
        guide_path = repo_root / "backend" / "copilot" / "sdk" / "agent_generation_guide.md"
        return guide_path.read_text(encoding="utf-8")

    def test_create_before_dry_run_in_workflow(self, guide_content: str):
        """Step 7 (Save/create_agent) must appear before step 8 (Dry-run)."""
        first_create = guide_content.index("create_agent")
        first_dry_run = guide_content.index("dry_run=True")
        assert (
            first_create < first_dry_run
        ), "create_agent must appear before dry_run=True in the workflow"

    def test_dry_run_before_inspect_in_verification_section(self, guide_content: str):
        """In the verification loop section, Dry-run step must come before
        Inspect & fix step."""
        loop_section = guide_content[
            guide_content.index("REQUIRED: Dry-Run Verification Loop") :
        ]
        assert (
            loop_section.index("**Dry-run**") < loop_section.index("**Inspect")
        ), "Dry-run step must come before Inspect & fix in the verification loop"

    def test_fix_before_repeat_in_verification_section(self, guide_content: str):
        """The Fix step must come before the Repeat step."""
        loop_section = guide_content[
            guide_content.index("REQUIRED: Dry-Run Verification Loop") :
        ]
        assert loop_section.index("**Fix**") < loop_section.index("**Repeat**")

    def test_good_output_before_bad_output(self, guide_content: str):
        """Good output examples should be listed before bad output examples,
        so the LLM sees the success pattern first."""
        assert guide_content.index("**Good output**") < guide_content.index(
            "**Bad output**"
        )

    def test_numbered_steps_in_verification_section(self, guide_content: str):
        """The step-by-step workflow should have numbered steps 1-5."""
        workflow_section = guide_content[
            guide_content.index("Step-by-step workflow") :
        ]
        # The section should contain numbered items 1 through 5
        for expected_step in range(1, 6):
            assert (
                f"{expected_step}. " in workflow_section
            ), f"Missing numbered step {expected_step} in verification workflow"

    def test_workflow_steps_are_in_numbered_order(self, guide_content: str):
        """The main workflow steps (1-9) must appear in ascending order."""
        # Extract the numbered workflow items from the top-level workflow section
        section_begin = guide_content.index("### Workflow for Creating/Editing Agents")
        # End at the next ### section
        section_end = guide_content.index("### Agent JSON Structure")
        workflow_text = guide_content[section_begin:section_end]
        found_steps: list[tuple[int, int]] = []
        for step_number in range(1, 10):
            hit = re.search(rf"^{step_number}\.\s", workflow_text, re.MULTILINE)
            if hit:
                found_steps.append((step_number, hit.start()))
        # Verify at least steps 1-9 are present and in order
        assert (
            len(found_steps) >= 9
        ), f"Expected 9 workflow steps, found {len(found_steps)}"
        for (left_num, left_pos), (right_num, right_pos) in zip(
            found_steps, found_steps[1:]
        ):
            assert left_pos < right_pos, (
                f"Step {left_num} (pos {left_pos}) should appear before "
                f"step {right_num} (pos {right_pos})"
            )
|
||||
Reference in New Issue
Block a user