From de57c992867791caa471c34920e99273cf2d032d Mon Sep 17 00:00:00 2001 From: Zamil Majdy Date: Fri, 30 Jan 2026 09:15:58 -0600 Subject: [PATCH] feat: add recent_executions and improve error messages for agent generation - Add RecentExecution model with status, correctness_score, and activity_summary - Expose recent_executions in LibraryAgent for quality assessment - Always pass error_details to user-facing messages for better debugging - Update ExecutionSummary TypedDict for sub-agent composition --- .../chat/tools/agent_generator/core.py | 64 +++++++++++----- .../api/features/chat/tools/create_agent.py | 2 +- .../api/features/chat/tools/edit_agent.py | 1 + .../backend/api/features/library/db.py | 6 +- .../backend/api/features/library/model.py | 73 +++++++++++++++++++ 5 files changed, 124 insertions(+), 22 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/core.py b/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/core.py index 269a655c92..0ee86ad517 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/core.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/core.py @@ -3,7 +3,7 @@ import logging import re import uuid -from typing import Any, TypedDict +from typing import Any, NotRequired, TypedDict from backend.api.features.library import db as library_db from backend.api.features.store import db as store_db @@ -27,15 +27,30 @@ from .service import ( logger = logging.getLogger(__name__) -class LibraryAgentSummary(TypedDict): - """Summary of a library agent for sub-agent composition.""" +class ExecutionSummary(TypedDict): + """Summary of a single execution for quality assessment.""" + status: str # COMPLETED, FAILED, RUNNING, QUEUED + correctness_score: NotRequired[float] # 0-1 score if evaluated + activity_summary: NotRequired[str] # AI-generated summary of what happened + + +class LibraryAgentSummary(TypedDict): + """Summary of a library agent for sub-agent composition. + + Includes recent executions to help the LLM decide whether to use this agent. + Each execution shows status, correctness_score (0-1), and activity_summary. + """ + + # Required fields graph_id: str graph_version: int name: str description: str input_schema: dict[str, Any] output_schema: dict[str, Any] + # Recent executions with detailed status and quality info + recent_executions: NotRequired[list[ExecutionSummary]] class MarketplaceAgentSummary(TypedDict): @@ -196,6 +211,10 @@ async def get_library_agents_for_generation( Uses search-based fetching to return relevant agents instead of all agents. This is more scalable for users with large libraries. + Includes recent_executions list to help the LLM assess agent quality: + - Each execution has status, correctness_score (0-1), and activity_summary + - This gives the LLM concrete examples of recent performance + Args: user_id: The user ID search_query: Optional search term to find relevant agents (user's goal/description) @@ -203,21 +222,16 @@ async def get_library_agents_for_generation( max_results: Maximum number of agents to return (default 15) Returns: - List of LibraryAgentSummary with schemas for sub-agent composition - - Note: - Future enhancement: Add quality filtering based on execution success rate - or correctness_score from AgentGraphExecution stats. The current - LibraryAgentStatus.ERROR is too aggressive (1 failed run = ERROR). 
- Better approach: filter by success rate (e.g., >50% successful runs) - or require at least 1 successful execution. + List of LibraryAgentSummary with schemas and recent executions for sub-agent composition """ try: + # Include executions to calculate accurate status and metrics response = await library_db.list_library_agents( user_id=user_id, search_term=search_query, page=1, page_size=max_results, + include_executions=True, ) results: list[LibraryAgentSummary] = [] @@ -225,16 +239,26 @@ async def get_library_agents_for_generation( if exclude_graph_id is not None and agent.graph_id == exclude_graph_id: continue - results.append( - LibraryAgentSummary( - graph_id=agent.graph_id, - graph_version=agent.graph_version, - name=agent.name, - description=agent.description, - input_schema=agent.input_schema, - output_schema=agent.output_schema, - ) + summary = LibraryAgentSummary( + graph_id=agent.graph_id, + graph_version=agent.graph_version, + name=agent.name, + description=agent.description, + input_schema=agent.input_schema, + output_schema=agent.output_schema, ) + # Include recent executions if available + if agent.recent_executions: + exec_summaries: list[ExecutionSummary] = [] + for ex in agent.recent_executions: + exec_sum = ExecutionSummary(status=ex.status) + if ex.correctness_score is not None: + exec_sum["correctness_score"] = ex.correctness_score + if ex.activity_summary: + exec_sum["activity_summary"] = ex.activity_summary + exec_summaries.append(exec_sum) + summary["recent_executions"] = exec_summaries + results.append(summary) return results except Exception as e: logger.warning(f"Failed to fetch library agents: {e}") diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/create_agent.py b/autogpt_platform/backend/backend/api/features/chat/tools/create_agent.py index 71221e1082..17478b2d13 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/create_agent.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/create_agent.py @@ -261,7 +261,7 @@ class CreateAgentTool(BaseTool): "The generated workflow had some structural issues. " "Please try simplifying your goal or breaking it into smaller steps." ), - error_details=error_msg if error_type == "validation_error" else None, + error_details=error_msg, # Always pass error details to give users context ) return ErrorResponse( message=user_message, diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py b/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py index 0a715ee62e..f57cd69574 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py @@ -181,6 +181,7 @@ class EditAgentTool(BaseTool): operation="generate the changes", llm_parse_message="The AI had trouble generating the changes. Please try again or simplify your request.", validation_message="The generated changes failed validation. 
Please try rephrasing your request.", + error_details=error_msg, # Always pass error details to give users context ) return ErrorResponse( message=user_message, diff --git a/autogpt_platform/backend/backend/api/features/library/db.py b/autogpt_platform/backend/backend/api/features/library/db.py index d613e4ae2c..c154cfc400 100644 --- a/autogpt_platform/backend/backend/api/features/library/db.py +++ b/autogpt_platform/backend/backend/api/features/library/db.py @@ -39,6 +39,7 @@ async def list_library_agents( sort_by: library_model.LibraryAgentSort = library_model.LibraryAgentSort.UPDATED_AT, page: int = 1, page_size: int = 50, + include_executions: bool = False, ) -> library_model.LibraryAgentResponse: """ Retrieves a paginated list of LibraryAgent records for a given user. @@ -49,6 +50,9 @@ async def list_library_agents( sort_by: Sorting field (createdAt, updatedAt, isFavorite, isCreatedByUser). page: Current page (1-indexed). page_size: Number of items per page. + include_executions: Whether to include execution data for status calculation. + Defaults to False for performance (UI fetches status separately). + Set to True when accurate status/metrics are needed (e.g., agent generator). Returns: A LibraryAgentResponse containing the list of agents and pagination details. @@ -116,7 +120,7 @@ async def list_library_agents( library_agents = await prisma.models.LibraryAgent.prisma().find_many( where=where_clause, include=library_agent_include( - user_id, include_nodes=False, include_executions=False + user_id, include_nodes=False, include_executions=include_executions ), order=order_by, skip=(page - 1) * page_size, diff --git a/autogpt_platform/backend/backend/api/features/library/model.py b/autogpt_platform/backend/backend/api/features/library/model.py index 14d7c7be81..c1e41248fb 100644 --- a/autogpt_platform/backend/backend/api/features/library/model.py +++ b/autogpt_platform/backend/backend/api/features/library/model.py @@ -39,6 +39,18 @@ class MarketplaceListing(pydantic.BaseModel): creator: MarketplaceListingCreator +class RecentExecution(pydantic.BaseModel): + """Summary of a recent execution for quality assessment. + + Used by the LLM to understand the agent's recent performance with specific examples + rather than just aggregate statistics. 
+ """ + + status: str # COMPLETED, FAILED, RUNNING, QUEUED + correctness_score: float | None = None # 0-1 score if evaluated + activity_summary: str | None = None # AI-generated summary of what happened + + class LibraryAgent(pydantic.BaseModel): """ Represents an agent in the library, including metadata for display and @@ -84,6 +96,19 @@ class LibraryAgent(pydantic.BaseModel): # Indicates whether there's a new output (based on recent runs) new_output: bool + # Execution metrics (for quality assessment by LLM) + execution_count: int = 0 # Number of recent executions sampled + success_rate: float | None = ( + None # Percentage (0-100) of technically successful executions + ) + avg_correctness_score: float | None = ( + None # 0-1 score of how well executions achieved their purpose + ) + recent_executions: list[RecentExecution] = pydantic.Field( + default_factory=list, + description="List of recent executions with status, score, and summary", + ) + # Whether the user can access the underlying graph can_access_graph: bool @@ -145,6 +170,50 @@ class LibraryAgent(pydantic.BaseModel): status = status_result.status new_output = status_result.new_output + # Calculate execution metrics + execution_count = len(executions) + success_rate: float | None = None + avg_correctness_score: float | None = None + if execution_count > 0: + success_count = sum( + 1 + for e in executions + if e.executionStatus == prisma.enums.AgentExecutionStatus.COMPLETED + ) + success_rate = (success_count / execution_count) * 100 + + # Calculate average correctness score from execution stats + correctness_scores = [] + for e in executions: + if e.stats and isinstance(e.stats, dict): + score = e.stats.get("correctness_score") + if score is not None and isinstance(score, (int, float)): + correctness_scores.append(float(score)) + if correctness_scores: + avg_correctness_score = sum(correctness_scores) / len( + correctness_scores + ) + + # Build recent executions list with status, score, and summary + recent_executions: list[RecentExecution] = [] + for e in executions: + exec_score: float | None = None + exec_summary: str | None = None + if e.stats and isinstance(e.stats, dict): + score = e.stats.get("correctness_score") + if score is not None and isinstance(score, (int, float)): + exec_score = float(score) + summary = e.stats.get("activity_status") + if summary is not None and isinstance(summary, str): + exec_summary = summary + recent_executions.append( + RecentExecution( + status=e.executionStatus.value, + correctness_score=exec_score, + activity_summary=exec_summary, + ) + ) + # Check if user can access the graph can_access_graph = agent.AgentGraph.userId == agent.userId @@ -190,6 +259,10 @@ class LibraryAgent(pydantic.BaseModel): has_sensitive_action=graph.has_sensitive_action, trigger_setup_info=graph.trigger_setup_info, new_output=new_output, + execution_count=execution_count, + success_rate=success_rate, + avg_correctness_score=avg_correctness_score, + recent_executions=recent_executions, can_access_graph=can_access_graph, is_latest_version=is_latest_version, is_favorite=agent.isFavorite,