fix(copilot): address PR review - reduce prompt redundancy, tighten tests

- Slim down DEFAULT_SYSTEM_PROMPT to a brief one-liner that references the supplement for the detailed workflow (avoids ~300 tokens of duplication)
- Tighten test assertions to use specific substring checks (e.g. section headers, exact phrases) instead of loose single-word matches
- Restore the view_agent_output reference in the agent generation guide for node-by-node execution trace inspection
- Add a test for the view_agent_output mention in the guide (22 tests total)
2026-04-08 03:00:28 -04:00 · 2026-03-26 20:35:19 +07:00
parent ac3a826ad0
commit c564ac7277
3 changed files with 44 additions and 60 deletions
--- a/autogpt_platform/backend/backend/copilot/dry_run_loop_test.py
+++ b/autogpt_platform/backend/backend/copilot/dry_run_loop_test.py
@@ -16,31 +16,24 @@ from backend.copilot.tools import TOOL_REGISTRY


 class TestSystemPromptDryRunLoop:
-    """Verify the system prompt includes dry-run loop instructions."""
+    """Verify the system prompt includes a brief dry-run loop reference.
+
+    The detailed workflow lives in the supplement (_SHARED_TOOL_NOTES);
+    the system prompt only carries a short pointer to keep it minimal.
+    """

    def test_system_prompt_mentions_dry_run(self):
-        assert (
-            "dry-run" in DEFAULT_SYSTEM_PROMPT.lower()
-            or "dry_run" in DEFAULT_SYSTEM_PROMPT
-        )
-
-    def test_system_prompt_mentions_create_edit_loop(self):
-        prompt_lower = DEFAULT_SYSTEM_PROMPT.lower()
-        assert "create" in prompt_lower
-        assert "edit_agent" in DEFAULT_SYSTEM_PROMPT or "edit" in prompt_lower
-        assert "loop" in prompt_lower or "repeat" in prompt_lower
-
-    def test_system_prompt_mentions_max_iterations(self):
-        assert "3" in DEFAULT_SYSTEM_PROMPT
-        assert "iteration" in DEFAULT_SYSTEM_PROMPT.lower()
-
-    def test_system_prompt_mentions_inspect_output(self):
-        prompt_lower = DEFAULT_SYSTEM_PROMPT.lower()
-        assert "inspect" in prompt_lower or "check" in prompt_lower
+        assert "dry-run" in DEFAULT_SYSTEM_PROMPT.lower()

    def test_system_prompt_mentions_never_skip(self):
        assert "NEVER skip" in DEFAULT_SYSTEM_PROMPT

+    def test_system_prompt_references_tool_notes(self):
+        assert "tool notes" in DEFAULT_SYSTEM_PROMPT.lower()
+
+    def test_system_prompt_mentions_iterations(self):
+        assert "3 iteration" in DEFAULT_SYSTEM_PROMPT.lower()
+

 class TestToolDescriptionsDryRunLoop:
    """Verify tool descriptions guide the LLM through the dry-run loop."""
@@ -55,13 +48,15 @@ class TestToolDescriptionsDryRunLoop:
        tool = TOOL_REGISTRY["edit_agent"]
        desc = tool.description
        assert "dry_run" in desc or "dry-run" in desc.lower()
-        assert "fix" in desc.lower() or "issues" in desc.lower()
+        assert "dry-run testing" in desc.lower() or "wiring errors" in desc.lower()

    def test_run_agent_mentions_dry_run_for_testing(self):
        tool = TOOL_REGISTRY["run_agent"]
        desc = tool.description
-        assert "dry_run" in desc or "dry-run" in desc.lower()
-        assert "test" in desc.lower() or "verify" in desc.lower()
+        assert "dry_run=True" in desc
+        assert (
+            "test agent wiring" in desc.lower() or "simulates execution" in desc.lower()
+        )

    def test_run_agent_dry_run_param_mentions_workflow(self):
        tool = TOOL_REGISTRY["run_agent"]
@@ -71,12 +66,12 @@ class TestToolDescriptionsDryRunLoop:
        dry_run_desc = params["properties"]["dry_run"]["description"]
        assert "create_agent" in dry_run_desc or "edit_agent" in dry_run_desc
        assert "wait_for_result" in dry_run_desc
-        assert "3" in dry_run_desc  # max iterations
+        assert "3 iterations" in dry_run_desc or "max " in dry_run_desc

    def test_get_agent_building_guide_mentions_workflow(self):
        tool = TOOL_REGISTRY["get_agent_building_guide"]
        desc = tool.description
-        assert "dry-run" in desc.lower() or "dry_run" in desc
+        assert "dry-run" in desc.lower()

    def test_run_agent_dry_run_param_exists(self):
        tool = TOOL_REGISTRY["run_agent"]
@@ -90,26 +85,24 @@ class TestToolDescriptionsDryRunLoop:
 class TestPromptingSupplementDryRunLoop:
    """Verify the prompting supplement includes the iterative workflow."""

-    def test_shared_tool_notes_include_dry_run_section(self):
-        assert (
-            "dry-run" in _SHARED_TOOL_NOTES.lower() or "dry_run" in _SHARED_TOOL_NOTES
-        )
+    def test_shared_tool_notes_include_dry_run_section_header(self):
+        assert "Iterative agent development" in _SHARED_TOOL_NOTES

-    def test_shared_tool_notes_include_loop_workflow(self):
-        notes_lower = _SHARED_TOOL_NOTES.lower()
-        assert "create" in notes_lower
-        assert "fix" in notes_lower
-        assert "iteration" in notes_lower or "repeat" in notes_lower
+    def test_shared_tool_notes_include_create_dry_run_fix_workflow(self):
+        assert "create -> dry-run -> fix" in _SHARED_TOOL_NOTES.lower()

    def test_shared_tool_notes_include_error_patterns(self):
        notes_lower = _SHARED_TOOL_NOTES.lower()
-        assert "error" in notes_lower
-        assert "null" in notes_lower or "empty" in notes_lower
+        assert "errors / failed nodes" in notes_lower
+        assert "null / empty outputs" in notes_lower
+        assert "nodes that never executed" in notes_lower
+
+    def test_shared_tool_notes_include_max_iterations(self):
+        assert "3 times" in _SHARED_TOOL_NOTES or "3 iterations" in _SHARED_TOOL_NOTES

    def test_sdk_supplement_includes_dry_run_section(self):
        supplement = get_sdk_supplement(use_e2b=False, cwd="/tmp/test")
-        supplement_lower = supplement.lower()
-        assert "dry-run" in supplement_lower or "dry_run" in supplement_lower
+        assert "Iterative agent development" in supplement


 class TestAgentBuildingGuideDryRunLoop:
@@ -121,21 +114,23 @@ class TestAgentBuildingGuideDryRunLoop:
        return guide_path.read_text(encoding="utf-8")

    def test_guide_has_dry_run_verification_section(self, guide_content):
-        assert "Dry-Run Verification Loop" in guide_content
+        assert "REQUIRED: Dry-Run Verification Loop" in guide_content

    def test_guide_workflow_includes_dry_run_step(self, guide_content):
-        # Check the workflow section mentions dry-run as a step
-        assert "dry_run=True" in guide_content or "dry_run" in guide_content
+        assert "dry_run=True" in guide_content

    def test_guide_mentions_good_vs_bad_output(self, guide_content):
-        assert "Good output" in guide_content or "good" in guide_content.lower()
-        assert "Bad output" in guide_content or "bad" in guide_content.lower()
+        assert "**Good output**" in guide_content
+        assert "**Bad output**" in guide_content

    def test_guide_mentions_max_iterations(self, guide_content):
-        assert "3 times" in guide_content or "3 iterations" in guide_content
+        assert "**3 times**" in guide_content

    def test_guide_mentions_wait_for_result(self, guide_content):
-        assert "wait_for_result" in guide_content
+        assert "wait_for_result=120" in guide_content
+
+    def test_guide_mentions_view_agent_output(self, guide_content):
+        assert "view_agent_output" in guide_content

    def test_guide_workflow_has_steps_8_and_9(self, guide_content):
        assert "8. **Dry-run**" in guide_content
--- a/autogpt_platform/backend/backend/copilot/sdk/agent_generation_guide.md
+++ b/autogpt_platform/backend/backend/copilot/sdk/agent_generation_guide.md
@@ -248,7 +248,10 @@ user the agent is ready. NEVER skip this step.
   and realistic sample inputs that exercise every path in the agent. This
   simulates execution using an LLM for each block — no real API calls,
   credentials, or credits are consumed.
-3. **Inspect output**: Examine the dry-run result for problems:
+3. **Inspect output**: Examine the dry-run result for problems. If
+   `wait_for_result` returns only a summary, call
+   `view_agent_output(execution_id=..., show_execution_details=True)` to
+   see the full node-by-node execution trace. Look for:
   - **Errors or failed nodes** — a node raised an exception or returned an
     error status. Common causes: wrong `source_name`/`sink_name` in links,
     missing `input_default` values, or referencing a nonexistent block output.
--- a/autogpt_platform/backend/backend/copilot/service.py
+++ b/autogpt_platform/backend/backend/copilot/service.py
@@ -63,21 +63,7 @@ Your goal is to help users automate tasks by:
 - Building and running working automations
 - Delivering tangible value through action, not just explanation

-## Agent Development: Create -> Dry-Run -> Fix Loop
-
-When creating or editing an agent, ALWAYS follow this iterative workflow:
-
-1. **Create/Edit** the agent using `create_agent` or `edit_agent`.
-2. **Dry-run** the agent immediately after saving by calling `run_agent` with `dry_run=True` and `wait_for_result=120`. Provide realistic sample inputs so every branch of the agent is exercised.
-3. **Inspect** the dry-run result. Look for:
-   - Errors or failed nodes in the execution output
-   - Missing or incorrectly wired inputs/outputs
-   - Unexpected null or empty values flowing between nodes
-   - Nodes that did not execute at all (wiring gaps)
-4. **Fix** any issues found by calling `edit_agent` with corrected agent JSON.
-5. **Repeat** steps 2-4 until the dry-run produces clean, expected results. Stop after at most 3 fix iterations to avoid excessive looping — if issues persist after 3 attempts, present the remaining problems to the user and ask for guidance.
-
-This loop catches wiring mistakes, missing inputs, and structural issues before the user runs the agent with real data and credentials. NEVER skip the dry-run step after creating or editing an agent.
+After creating or editing an agent, ALWAYS dry-run it with `run_agent(dry_run=True, wait_for_result=120)` to verify it works, then fix and re-test (up to 3 iterations). NEVER skip the dry-run step. See tool notes for the full workflow.

 Be concise, proactive, and action-oriented. Bias toward showing working solutions over lengthy explanations."""