From 260bbb28bd715c09fb1a05f0fca0c8f01ef4e78c Mon Sep 17 00:00:00 2001 From: majdyz Date: Sat, 11 Apr 2026 12:06:47 +0000 Subject: [PATCH] fix(backend/copilot): log GraphValidationError race path for observability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both _run_agent and _schedule_agent silently swallowed the GraphValidationError that triggers the credential-race fallback and returned the inline setup card with no log. That left the race invisible to oncall — every recovered request looked identical to a cold-cache first attempt, and credential-drift rates could not be monitored. Add a logger.warning at both race catch sites that captures user_id, graph_id, and the raw node_errors so SRE can track how often the prereq check drifts from the executor/scheduler re-validation. Keeps the user-facing behaviour unchanged — still returns the inline card — but makes the race observable. --- .../backend/copilot/tools/run_agent.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/autogpt_platform/backend/backend/copilot/tools/run_agent.py b/autogpt_platform/backend/backend/copilot/tools/run_agent.py index ff1f1902d1..91f24ed302 100644 --- a/autogpt_platform/backend/backend/copilot/tools/run_agent.py +++ b/autogpt_platform/backend/backend/copilot/tools/run_agent.py @@ -575,6 +575,16 @@ class RunAgentTool(BaseTool): dry_run=dry_run, ) except GraphValidationError as e: + # Reaching here means ``_check_prerequisites`` passed but the + # executor's validator re-raised milliseconds later — surface + # the race/drift so oncall can monitor how often this fires. + logger.warning( + "Race: GraphValidationError from add_graph_execution after " + "prereq check passed (user_id=%s graph_id=%s node_errors=%s)", + user_id, + graph.id, + dict(e.node_errors), + ) creds_setup = self._build_setup_requirements_from_validation_error( graph=graph, error=e, @@ -770,6 +780,16 @@ class RunAgentTool(BaseTool): user_timezone=user_timezone, ) except GraphValidationError as e: + # Reaching here means ``_check_prerequisites`` passed but the + # scheduler's re-validation raised — surface the race/drift so + # oncall can monitor how often this fires. + logger.warning( + "Race: GraphValidationError from add_execution_schedule after " + "prereq check passed (user_id=%s graph_id=%s node_errors=%s)", + user_id, + graph.id, + dict(e.node_errors), + ) creds_setup = self._build_setup_requirements_from_validation_error( graph=graph, error=e,