test(copilot): add functional tests for dry-run loop beyond substring checks

Add 23 new tests covering:
- run_agent and run_block OpenAI tool schema validation (type, optionality,
  description quality, coexistence of dry_run + wait_for_result)
- RunAgentInput Pydantic model behavior (default value, bool coercion,
  combined parameters, validation bounds, string stripping)
- Guide workflow ordering (create before dry-run, dry-run before inspect,
  fix before repeat, numbered step sequence)
This commit is contained in:
Zamil Majdy
2026-03-29 19:37:26 +02:00
parent 65108c31dc
commit 9966e122ab

View File

@@ -1,4 +1,4 @@
"""Prompt regression tests for the dry-run verification loop.
"""Prompt regression tests AND functional tests for the dry-run verification loop.
These tests verify that the create -> dry-run -> fix iterative workflow is
properly communicated through tool descriptions, the prompting supplement,
@@ -11,21 +11,48 @@ descriptions no longer repeat it — they keep a minimal footprint.
**Intentionally brittle**: the assertions check for specific substrings so
that accidental removal or rewording of key instructions is caught. If you
deliberately reword a prompt, update the corresponding assertion here.
--- Functional tests (added separately) ---
The dry-run loop is primarily a *prompt/guide* feature — the copilot reads
the guide and follows its instructions. There are no standalone Python
functions that implement "loop until passing" logic; the loop is driven by
the LLM. However, several pieces of real Python infrastructure make the
loop possible:
1. The ``run_agent`` and ``run_block`` OpenAI tool schemas expose a
``dry_run`` boolean parameter that the LLM must be able to set.
2. The ``RunAgentInput`` Pydantic model validates ``dry_run`` as a bool,
defaulting to False, so the executor can branch on it.
3. The ``_check_prerequisites`` method in ``RunAgentTool`` bypasses
credential and missing-input gates when ``dry_run=True``.
4. The guide documents the workflow steps in a specific order that the LLM
must follow: create/edit -> dry-run -> inspect -> fix -> repeat.
The functional test classes below exercise items 1-4 directly.
"""
import re
from pathlib import Path
from typing import Any, cast

import pytest
from openai.types.chat import ChatCompletionToolParam
from pydantic import ValidationError

from backend.copilot.prompting import get_sdk_supplement
from backend.copilot.service import DEFAULT_SYSTEM_PROMPT
from backend.copilot.tools import TOOL_REGISTRY
from backend.copilot.tools.run_agent import RunAgentInput
# Resolved once for the whole module so individual tests stay fast.
_SDK_SUPPLEMENT = get_sdk_supplement(use_e2b=False, cwd="/tmp/test")
# ---------------------------------------------------------------------------
# Prompt regression tests (original)
# ---------------------------------------------------------------------------
class TestSystemPromptBasics:
"""Verify the system prompt includes essential baseline content.
@@ -122,3 +149,252 @@ class TestAgentBuildingGuideDryRunLoop:
def test_workflow_has_dry_run_and_inspect_steps(self, guide_content):
assert "**Dry-run**" in guide_content
assert "**Inspect & fix**" in guide_content
# ---------------------------------------------------------------------------
# Functional tests: tool schema validation
# ---------------------------------------------------------------------------
class TestRunAgentToolSchema:
    """Structural checks on the run_agent OpenAI tool schema's dry_run support.

    Unlike the substring-based prompt tests above, these inspect the exact
    schema object handed to the LLM, so a malformed or mistyped ``dry_run``
    parameter is caught before it ever reaches OpenAI function-calling.
    """

    @pytest.fixture
    def schema(self) -> ChatCompletionToolParam:
        # Build the schema once per test from the registered tool.
        return TOOL_REGISTRY["run_agent"].as_openai_tool()

    def test_schema_is_valid_openai_tool(self, schema: ChatCompletionToolParam):
        """The schema carries the top-level structure OpenAI requires."""
        assert schema["type"] == "function"
        assert "function" in schema
        fn = schema["function"]
        for key in ("name", "description", "parameters"):
            assert key in fn
        assert fn["name"] == "run_agent"

    def test_dry_run_not_required(self, schema: ChatCompletionToolParam):
        """dry_run must stay optional (absent from 'required') so it defaults off."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        assert "dry_run" not in parameters.get("required", [])

    def test_dry_run_is_boolean_type(self, schema: ChatCompletionToolParam):
        """dry_run is declared boolean so the LLM emits true/false literals."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        dry_run_spec = parameters["properties"]["dry_run"]
        assert dry_run_spec["type"] == "boolean"

    def test_dry_run_description_is_nonempty(self, schema: ChatCompletionToolParam):
        """A substantive description is what steers the LLM's use of the flag."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        description = parameters["properties"]["dry_run"]["description"]
        assert isinstance(description, str)
        assert len(description) > 20, "Description too short to guide the LLM"

    def test_wait_for_result_coexists_with_dry_run(
        self, schema: ChatCompletionToolParam
    ):
        """The guide tells the LLM to pass dry_run=True together with
        wait_for_result=120, so both parameters must exist side by side."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        properties = parameters["properties"]
        assert "wait_for_result" in properties
        assert properties["wait_for_result"]["type"] == "integer"
class TestRunBlockToolSchema:
    """Structural checks on the run_block OpenAI tool schema's dry_run support."""

    @pytest.fixture
    def schema(self) -> ChatCompletionToolParam:
        # Build the schema once per test from the registered tool.
        return TOOL_REGISTRY["run_block"].as_openai_tool()

    def test_schema_is_valid_openai_tool(self, schema: ChatCompletionToolParam):
        assert schema["type"] == "function"
        fn = schema["function"]
        assert fn["name"] == "run_block"
        assert "parameters" in fn

    def test_dry_run_exists_and_is_boolean(self, schema: ChatCompletionToolParam):
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        properties = parameters["properties"]
        assert "dry_run" in properties
        assert properties["dry_run"]["type"] == "boolean"

    def test_dry_run_not_required(self, schema: ChatCompletionToolParam):
        """dry_run stays optional, while block_id and input_data are mandatory."""
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        required_names = parameters.get("required", [])
        assert "dry_run" not in required_names
        # The two real inputs, by contrast, must be listed as required.
        assert "block_id" in required_names
        assert "input_data" in required_names

    def test_dry_run_description_mentions_simulation(
        self, schema: ChatCompletionToolParam
    ):
        parameters = cast(dict[str, Any], schema["function"].get("parameters", {}))
        description = parameters["properties"]["dry_run"]["description"]
        assert isinstance(description, str)
        assert (
            "simulat" in description.lower()
        ), "run_block dry_run description should mention simulation"
# ---------------------------------------------------------------------------
# Functional tests: RunAgentInput Pydantic model
# ---------------------------------------------------------------------------
class TestRunAgentInputModel:
    """Validate that the RunAgentInput Pydantic model handles dry_run correctly.

    The executor reads ``dry_run`` from this model, so it must parse, default,
    and validate properly.
    """

    def test_dry_run_defaults_to_false(self):
        """When dry_run is omitted, it must default to False."""
        model = RunAgentInput(username_agent_slug="user/agent")
        assert model.dry_run is False

    def test_dry_run_accepts_true(self):
        model = RunAgentInput(username_agent_slug="user/agent", dry_run=True)
        assert model.dry_run is True

    def test_dry_run_accepts_false(self):
        model = RunAgentInput(username_agent_slug="user/agent", dry_run=False)
        assert model.dry_run is False

    def test_dry_run_coerces_truthy_int(self):
        """Pydantic bool fields coerce int 1 to True."""
        model = RunAgentInput(username_agent_slug="user/agent", dry_run=1)  # type: ignore[arg-type]
        assert model.dry_run is True

    def test_dry_run_coerces_falsy_int(self):
        """Pydantic bool fields coerce int 0 to False."""
        model = RunAgentInput(username_agent_slug="user/agent", dry_run=0)  # type: ignore[arg-type]
        assert model.dry_run is False

    def test_dry_run_with_wait_for_result(self):
        """The guide instructs passing both dry_run=True and wait_for_result=120.
        The model must accept this combination."""
        model = RunAgentInput(
            username_agent_slug="user/agent",
            dry_run=True,
            wait_for_result=120,
        )
        assert model.dry_run is True
        assert model.wait_for_result == 120

    def test_wait_for_result_upper_bound(self):
        """wait_for_result is bounded at 300 seconds (ge=0, le=300)."""
        # Pin the *specific* failure type: a bare `pytest.raises(Exception)`
        # would also pass on unrelated errors (e.g. a TypeError from a bad
        # call) and hide regressions in the field's validation bounds.
        with pytest.raises(ValidationError):
            RunAgentInput(
                username_agent_slug="user/agent",
                dry_run=True,
                wait_for_result=301,
            )

    def test_string_fields_are_stripped(self):
        """The strip_strings validator should strip whitespace from string fields."""
        model = RunAgentInput(username_agent_slug=" user/agent ", dry_run=True)
        assert model.username_agent_slug == "user/agent"
# ---------------------------------------------------------------------------
# Functional tests: guide documents the correct workflow ordering
# ---------------------------------------------------------------------------
class TestGuideWorkflowOrdering:
    """Assert the guide's workflow steps appear in the correct order.

    The LLM must see: create/edit -> dry-run -> inspect -> fix -> repeat.
    Were these steps reordered, the copilot would follow the wrong sequence,
    so these tests check *relative position*, not mere presence.
    """

    @pytest.fixture
    def guide_content(self) -> str:
        # tests/ -> repo root, then down into the SDK's guide document.
        repo_root = Path(__file__).resolve().parent.parent.parent
        guide_path = (
            repo_root / "backend" / "copilot" / "sdk" / "agent_generation_guide.md"
        )
        return guide_path.read_text(encoding="utf-8")

    def test_create_before_dry_run_in_workflow(self, guide_content: str):
        """Step 7 (Save/create_agent) must appear before step 8 (Dry-run)."""
        assert guide_content.index("create_agent") < guide_content.index(
            "dry_run=True"
        ), "create_agent must appear before dry_run=True in the workflow"

    def test_dry_run_before_inspect_in_verification_section(self, guide_content: str):
        """Within the verification-loop section, the Dry-run step precedes
        the Inspect & fix step."""
        loop_text = guide_content[
            guide_content.index("REQUIRED: Dry-Run Verification Loop") :
        ]
        assert loop_text.index("**Dry-run**") < loop_text.index(
            "**Inspect"
        ), "Dry-run step must come before Inspect & fix in the verification loop"

    def test_fix_before_repeat_in_verification_section(self, guide_content: str):
        """The Fix step must precede the Repeat step."""
        loop_text = guide_content[
            guide_content.index("REQUIRED: Dry-Run Verification Loop") :
        ]
        assert loop_text.index("**Fix**") < loop_text.index("**Repeat**")

    def test_good_output_before_bad_output(self, guide_content: str):
        """Good output examples come first so the LLM sees the success
        pattern before the failure pattern."""
        assert guide_content.index("**Good output**") < guide_content.index(
            "**Bad output**"
        )

    def test_numbered_steps_in_verification_section(self, guide_content: str):
        """The step-by-step workflow enumerates steps 1 through 5."""
        step_text = guide_content[guide_content.index("Step-by-step workflow") :]
        for number in range(1, 6):
            assert (
                f"{number}. " in step_text
            ), f"Missing numbered step {number} in verification workflow"

    def test_workflow_steps_are_in_numbered_order(self, guide_content: str):
        """The main workflow steps (1-9) must appear in ascending order."""
        # Slice out just the top-level workflow section, ending where the
        # next ### heading begins.
        start = guide_content.index("### Workflow for Creating/Editing Agents")
        end = guide_content.index("### Agent JSON Structure")
        workflow_text = guide_content[start:end]

        # Record where each numbered item ("N. ") starts a line, if present.
        step_positions: list[tuple[int, int]] = []
        for number in range(1, 10):
            hit = re.search(rf"^{number}\.\s", workflow_text, re.MULTILINE)
            if hit is not None:
                step_positions.append((number, hit.start()))

        # All nine steps must exist, and their positions must be ascending.
        assert (
            len(step_positions) >= 9
        ), f"Expected 9 workflow steps, found {len(step_positions)}"
        for (prev_num, prev_pos), (curr_num, curr_pos) in zip(
            step_positions, step_positions[1:]
        ):
            assert prev_pos < curr_pos, (
                f"Step {prev_num} (pos {prev_pos}) should appear before "
                f"step {curr_num} (pos {curr_pos})"
            )