fix(copilot): fix dry-run simulation showing INCOMPLETE/error status (#12580)

## Summary
- **Backend**: Strip empty `error` pins from dry-run simulation outputs
that the simulator always includes (set to `""` meaning "no error").
This was causing the LLM to misinterpret successful simulations as
failures and report "INCOMPLETE" status to users
- **Backend**: Add explicit "Status: COMPLETED" to dry-run response
message to prevent LLM misinterpretation
- **Backend**: Update simulation prompt to exclude `error` from the
"MUST include" keys list, and instruct LLM to omit error unless
simulating a logical failure
- **Frontend**: Fix `isRunBlockErrorOutput()` type guard that was too
broad (`"error" in output` matched BlockOutputResponse objects, not just
ErrorResponse), causing dry-run results to be displayed as errors
- **Frontend**: Fix `parseOutput()` fallback matching to not classify
BlockOutputResponse as ErrorResponse
- **Frontend**: Filter out empty error pins from `BlockOutputCard`
display and accordion metadata output key counting
- **Frontend**: Clear stale execution results before dry-run/no-input
runs so the UI shows fresh output
- **Frontend**: Fix first-click simulate race condition by invalidating
execution details query after WebSocket subscription confirms

## Test plan
- [x] All 12 existing + 5 new dry-run tests pass (`poetry run pytest
backend/copilot/tools/test_dry_run.py -x -v`)
- [x] All 23 helpers tests pass (`poetry run pytest
backend/copilot/tools/helpers_test.py -x -v`)
- [x] All 13 run_block tests pass (`poetry run pytest
backend/copilot/tools/run_block_test.py -x -v`)
- [x] Backend linting passes (ruff check + format)
- [x] Frontend linting passes (next lint)
- [ ] Manual: trigger dry-run on a block with error output pin (e.g.
Komodo Image Generator) — should show "Simulated" status with clean
output, no misleading "error" section
- [ ] Manual: first click on Simulate button should immediately show
results (no race condition)

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Nicholas Tindle <nicholas.tindle@agpt.co>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
This commit is contained in:
Zamil Majdy
2026-03-31 23:03:00 +02:00
committed by GitHub
parent 80581a8364
commit c659f3b058
6 changed files with 201 additions and 15 deletions

View File

@@ -114,6 +114,7 @@ async def execute_block(
error=sim_error[0],
session_id=session_id,
)
return BlockOutputResponse(
message=f"Block '{block.name}' executed successfully",
block_id=block_id,

View File

@@ -73,7 +73,10 @@ def make_openai_response(
@pytest.mark.asyncio
async def test_simulate_block_basic():
"""simulate_block returns correct (output_name, output_data) tuples."""
"""simulate_block returns correct (output_name, output_data) tuples.
Empty "error" pins are dropped at source — only non-empty errors are yielded.
"""
mock_block = make_mock_block()
mock_client = AsyncMock()
mock_client.chat.completions.create = AsyncMock(
@@ -88,7 +91,8 @@ async def test_simulate_block_basic():
outputs.append((name, data))
assert ("result", "simulated output") in outputs
assert ("error", "") in outputs
# Empty error pin is dropped at the simulator level
assert ("error", "") not in outputs
@pytest.mark.asyncio
@@ -113,6 +117,8 @@ async def test_simulate_block_json_retry():
assert mock_client.chat.completions.create.call_count == 3
assert ("result", "ok") in outputs
# Empty error pin is dropped
assert ("error", "") not in outputs
@pytest.mark.asyncio
@@ -141,7 +147,7 @@ async def test_simulate_block_all_retries_exhausted():
@pytest.mark.asyncio
async def test_simulate_block_missing_output_pins():
"""LLM response missing some output pins; verify they're filled with None."""
"""LLM response missing some output pins; verify non-error pins filled with None."""
mock_block = make_mock_block(
output_props={
"result": {"type": "string"},
@@ -164,7 +170,29 @@ async def test_simulate_block_missing_output_pins():
assert outputs["result"] == "hello"
assert outputs["count"] is None # missing pin filled with None
assert outputs["error"] == "" # "error" pin filled with ""
assert "error" not in outputs # missing error pin is omitted entirely
@pytest.mark.asyncio
async def test_simulate_block_keeps_nonempty_error():
    """A non-empty "error" pin (a simulated logical failure) is preserved.

    Only *empty* error pins are stripped at the simulator level; genuine
    error messages must still flow through to callers.
    """
    block = make_mock_block()
    client = AsyncMock()
    client.chat.completions.create = AsyncMock(
        return_value=make_openai_response(
            '{"result": "", "error": "API rate limit exceeded"}'
        )
    )
    with patch(
        "backend.executor.simulator.get_openai_client", return_value=client
    ):
        # Collect every (output_name, output_data) pair the simulator yields.
        yielded = [
            (name, data)
            async for name, data in simulate_block(block, {"query": "test"})
        ]

    assert ("result", "") in yielded
    assert ("error", "API rate limit exceeded") in yielded
@pytest.mark.asyncio
@@ -200,6 +228,19 @@ async def test_simulate_block_truncates_long_inputs():
assert len(parsed["text"]) < 25000
def test_build_simulation_prompt_excludes_error_from_must_include():
    """'error' must not appear on the "MUST include" prompt line.

    The prompt separately instructs the LLM to OMIT the error pin unless it
    is simulating a logical failure; listing it under "MUST include" would
    directly contradict that instruction.
    """
    block = make_mock_block()  # default mock exposes "result" and "error" pins
    prompt, _ = build_simulation_prompt(block, {"query": "test"})
    # Pull out the first line carrying the "MUST include" instruction.
    matching = [ln for ln in prompt.splitlines() if "MUST include" in ln]
    instruction = matching[0]
    assert '"result"' in instruction
    assert '"error"' not in instruction
# ---------------------------------------------------------------------------
# execute_block dry-run tests
# ---------------------------------------------------------------------------
@@ -334,6 +375,97 @@ def test_run_block_tool_dry_run_calls_execute():
assert "dry_run=dry_run" in source_execute
@pytest.mark.asyncio
async def test_execute_block_dry_run_no_empty_error_from_simulator():
    """Dry-run passes simulator output through without inventing error pins.

    The empty-error fix lives at the simulator level, so execute_block simply
    forwards whatever the simulator produces: here only a "result" pin, and
    the response must contain no "error" key at all.
    """
    block = make_mock_block()

    async def stub_simulator(block, input_data):
        # Mirrors the fixed simulator: empty error pins are never yielded.
        yield "result", "simulated output"

    with patch(
        "backend.copilot.tools.helpers.simulate_block", side_effect=stub_simulator
    ):
        response = await execute_block(
            block=block,
            block_id="test-block-id",
            input_data={"query": "hello"},
            user_id="user-1",
            session_id="session-1",
            node_exec_id="node-exec-1",
            matched_credentials={},
            dry_run=True,
        )

    assert isinstance(response, BlockOutputResponse)
    assert response.outputs == {"result": ["simulated output"]}
    assert "error" not in response.outputs
    assert response.success is True
    assert response.is_dry_run is True
@pytest.mark.asyncio
async def test_execute_block_dry_run_keeps_nonempty_error_pin():
    """A dry-run must preserve an "error" pin carrying a real error message."""
    block = make_mock_block()

    async def stub_simulator(block, input_data):
        # Simulated logical failure: blank result plus a genuine error string.
        yield "result", ""
        yield "error", "API rate limit exceeded"

    with patch(
        "backend.copilot.tools.helpers.simulate_block", side_effect=stub_simulator
    ):
        response = await execute_block(
            block=block,
            block_id="test-block-id",
            input_data={"query": "hello"},
            user_id="user-1",
            session_id="session-1",
            node_exec_id="node-exec-1",
            matched_credentials={},
            dry_run=True,
        )

    assert isinstance(response, BlockOutputResponse)
    assert response.success is True
    # The non-empty error pin must survive the dry-run pass-through.
    assert "error" in response.outputs
    assert response.outputs["error"] == ["API rate limit exceeded"]
@pytest.mark.asyncio
async def test_execute_block_dry_run_message_includes_completed_status():
    """The dry-run response message must unambiguously read as a success."""
    block = make_mock_block()

    async def stub_simulator(block, input_data):
        yield "result", "simulated"

    with patch(
        "backend.copilot.tools.helpers.simulate_block", side_effect=stub_simulator
    ):
        response = await execute_block(
            block=block,
            block_id="test-block-id",
            input_data={"query": "hello"},
            user_id="user-1",
            session_id="session-1",
            node_exec_id="node-exec-1",
            matched_credentials={},
            dry_run=True,
        )

    assert isinstance(response, BlockOutputResponse)
    assert "executed successfully" in response.message
@pytest.mark.asyncio
async def test_execute_block_dry_run_simulator_error_returns_error_response():
"""When simulate_block yields a SIMULATOR ERROR tuple, execute_block returns ErrorResponse."""

View File

@@ -13,7 +13,7 @@ Inspired by https://github.com/Significant-Gravitas/agent-simulator
import json
import logging
from collections.abc import AsyncIterator
from collections.abc import AsyncGenerator
from typing import Any
from backend.util.clients import get_openai_client
@@ -96,6 +96,10 @@ def build_simulation_prompt(block: Any, input_data: dict[str, Any]) -> tuple[str
input_pins = _describe_schema_pins(input_schema)
output_pins = _describe_schema_pins(output_schema)
output_properties = list(output_schema.get("properties", {}).keys())
# Build a separate list for the "MUST include" instruction that excludes
# "error" — the prompt already tells the LLM to OMIT the error pin unless
# simulating a logical error. Including it in "MUST include" is contradictory.
required_output_properties = [k for k in output_properties if k != "error"]
block_name = getattr(block, "name", type(block).__name__)
block_description = getattr(block, "description", "No description available.")
@@ -117,10 +121,10 @@ Rules:
- Respond with a single JSON object whose keys are EXACTLY the output pin names listed above.
- Assume all credentials and authentication are present and valid. Never simulate authentication failures.
- Make the simulated outputs realistic and consistent with the inputs.
- If there is an "error" pin, set it to "" (empty string) unless you are simulating a logical error.
- If there is an "error" pin, OMIT it entirely unless you are simulating a logical error. Only include the "error" pin when there is a genuine error message to report.
- Do not include any extra keys beyond the output pins.
Output pin names you MUST include: {json.dumps(output_properties)}
Output pin names you MUST include: {json.dumps(required_output_properties)}
"""
safe_inputs = _truncate_input_values(input_data)
@@ -132,7 +136,7 @@ Output pin names you MUST include: {json.dumps(output_properties)}
async def simulate_block(
block: Any,
input_data: dict[str, Any],
) -> AsyncIterator[tuple[str, Any]]:
) -> AsyncGenerator[tuple[str, Any], None]:
"""Simulate block execution using an LLM.
Yields (output_name, output_data) tuples matching the Block.execute() interface.
@@ -172,13 +176,26 @@ async def simulate_block(
if not isinstance(parsed, dict):
raise ValueError(f"LLM returned non-object JSON: {raw[:200]}")
# Fill missing output pins with defaults
# Fill missing output pins with defaults.
# Skip empty "error" pins — an empty string means "no error" and
# would only confuse downstream consumers (LLM, frontend).
result: dict[str, Any] = {}
for pin_name in output_properties:
if pin_name in parsed:
result[pin_name] = parsed[pin_name]
else:
result[pin_name] = "" if pin_name == "error" else None
value = parsed[pin_name]
# Drop empty/blank error pins: they carry no information.
# Uses strip() intentionally so whitespace-only strings
# (e.g. " ", "\n") are also treated as empty.
if (
pin_name == "error"
and isinstance(value, str)
and not value.strip()
):
continue
result[pin_name] = value
elif pin_name != "error":
# Only fill non-error missing pins with None
result[pin_name] = None
logger.debug(
"simulate_block: block=%s attempt=%d tokens=%s/%s",

View File

@@ -33,6 +33,12 @@ export const useRunGraph = () => {
const clearAllNodeErrors = useNodeStore(
useShallow((state) => state.clearAllNodeErrors),
);
const cleanNodesStatuses = useNodeStore(
useShallow((state) => state.cleanNodesStatuses),
);
const clearAllNodeExecutionResults = useNodeStore(
useShallow((state) => state.clearAllNodeExecutionResults),
);
// Tutorial integration - force open dialog when tutorial requests it
const forceOpenRunInputDialog = useTutorialStore(
@@ -137,6 +143,9 @@ export const useRunGraph = () => {
if (!dryRun && (hasInputs() || hasCredentials())) {
setOpenRunInputDialog(true);
} else {
// Clear stale results so the UI shows fresh output from this execution
clearAllNodeExecutionResults();
cleanNodesStatuses();
// Optimistically set running state immediately for responsive UI
setIsGraphRunning(true);
await executeGraph({

View File

@@ -10,9 +10,12 @@ import { NodeExecutionResult } from "@/app/api/__generated__/models/nodeExecutio
import { AgentExecutionStatus } from "@/app/api/__generated__/models/agentExecutionStatus";
import { useGraphStore } from "../../../stores/graphStore";
import { useEdgeStore } from "../../../stores/edgeStore";
import { useQueryClient } from "@tanstack/react-query";
import { getGetV1GetExecutionDetailsQueryKey } from "@/app/api/__generated__/endpoints/graphs/graphs";
export const useFlowRealtime = () => {
const api = useBackendAPI();
const queryClient = useQueryClient();
const updateNodeExecutionResult = useNodeStore(
useShallow((state) => state.updateNodeExecutionResult),
);
@@ -71,6 +74,16 @@ export const useFlowRealtime = () => {
console.debug(
`Subscribed to updates for execution #${flowExecutionID}`,
);
// Refetch execution details to catch any events that were
// published before the WebSocket subscription was established.
// This closes the race-condition window for fast-completing
// executions like dry-runs / simulations.
void queryClient.invalidateQueries({
queryKey: getGetV1GetExecutionDetailsQueryKey(
flowID!,
flowExecutionID,
),
});
})
.catch((error) =>
console.error(
@@ -87,7 +100,7 @@ export const useFlowRealtime = () => {
deregisterGraphExecutionStatusEvent();
resetEdgeBeads();
};
}, [api, flowExecutionID, resetEdgeBeads]);
}, [api, flowExecutionID, resetEdgeBeads, queryClient, flowID]);
return {};
};

View File

@@ -99,7 +99,19 @@ export function isRunBlockReviewRequiredOutput(
/**
 * Type guard for ErrorResponse payloads.
 *
 * Only matches actual error responses (type=error), not block outputs that
 * happen to have an "error" key in their outputs dict. The old
 * `"error" in output` check was too broad and caused BlockOutputResponse
 * to be mis-identified as an error, showing dry-run results as failed.
 *
 * NOTE: the diff residue left the stale pre-fix `return output.type === ... ||
 * "error" in output;` line ahead of the new logic, which returned
 * unconditionally and made the entire fix dead code — it is removed here.
 */
export function isRunBlockErrorOutput(
  output: RunBlockToolOutput,
): output is ErrorResponse {
  if (output.type === ResponseType.error) return true;
  // Fallback for untyped payloads: match only if "error" exists at the top
  // level AND there is no "block_id" (which distinguishes BlockOutputResponse
  // from ErrorResponse). Note: `type` is optional in both interfaces, so
  // correctness here depends on `block_id` presence (always set on
  // BlockOutputResponse), not on `type` presence.
  if (!("type" in output) && "error" in output && !("block_id" in output))
    return true;
  return false;
}
function parseOutput(output: unknown): RunBlockToolOutput | null {
@@ -122,7 +134,9 @@ function parseOutput(output: unknown): RunBlockToolOutput | null {
if ("block_id" in output) return output as BlockOutputResponse;
if ("block" in output) return output as BlockDetailsResponse;
if ("setup_info" in output) return output as SetupRequirementsResponse;
if ("error" in output || "details" in output)
// Only match error responses that have an "error" key but NOT "block_id"
// (which would indicate a BlockOutputResponse, not an error).
if (("error" in output || "details" in output) && !("block_id" in output))
return output as ErrorResponse;
}
return null;