From de57c992867791caa471c34920e99273cf2d032d Mon Sep 17 00:00:00 2001 From: Zamil Majdy Date: Fri, 30 Jan 2026 09:15:58 -0600 Subject: [PATCH] feat: add recent_executions and improve error messages for agent generation - Add RecentExecution model with status, correctness_score, and activity_summary - Expose recent_executions in LibraryAgent for quality assessment - Always pass error_details to user-facing messages for better debugging - Update ExecutionSummary TypedDict for sub-agent composition --- .../chat/tools/agent_generator/core.py | 64 +++++++++++----- .../api/features/chat/tools/create_agent.py | 2 +- .../api/features/chat/tools/edit_agent.py | 1 + .../backend/api/features/library/db.py | 6 +- .../backend/api/features/library/model.py | 73 +++++++++++++++++++ 5 files changed, 124 insertions(+), 22 deletions(-) diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/core.py b/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/core.py index 269a655c92..0ee86ad517 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/core.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/agent_generator/core.py @@ -3,7 +3,7 @@ import logging import re import uuid -from typing import Any, TypedDict +from typing import Any, NotRequired, TypedDict from backend.api.features.library import db as library_db from backend.api.features.store import db as store_db @@ -27,15 +27,30 @@ from .service import ( logger = logging.getLogger(__name__) -class LibraryAgentSummary(TypedDict): - """Summary of a library agent for sub-agent composition.""" +class ExecutionSummary(TypedDict): + """Summary of a single execution for quality assessment.""" + status: str # COMPLETED, FAILED, RUNNING, QUEUED + correctness_score: NotRequired[float] # 0-1 score if evaluated + activity_summary: NotRequired[str] # AI-generated summary of what happened + + +class LibraryAgentSummary(TypedDict): + """Summary of a library agent for sub-agent composition. + + Includes recent executions to help the LLM decide whether to use this agent. + Each execution shows status, correctness_score (0-1), and activity_summary. + """ + + # Required fields graph_id: str graph_version: int name: str description: str input_schema: dict[str, Any] output_schema: dict[str, Any] + # Recent executions with detailed status and quality info + recent_executions: NotRequired[list[ExecutionSummary]] class MarketplaceAgentSummary(TypedDict): @@ -196,6 +211,10 @@ async def get_library_agents_for_generation( Uses search-based fetching to return relevant agents instead of all agents. This is more scalable for users with large libraries. + Includes recent_executions list to help the LLM assess agent quality: + - Each execution has status, correctness_score (0-1), and activity_summary + - This gives the LLM concrete examples of recent performance + Args: user_id: The user ID search_query: Optional search term to find relevant agents (user's goal/description) @@ -203,21 +222,16 @@ async def get_library_agents_for_generation( max_results: Maximum number of agents to return (default 15) Returns: - List of LibraryAgentSummary with schemas for sub-agent composition - - Note: - Future enhancement: Add quality filtering based on execution success rate - or correctness_score from AgentGraphExecution stats. The current - LibraryAgentStatus.ERROR is too aggressive (1 failed run = ERROR). 
- Better approach: filter by success rate (e.g., >50% successful runs) - or require at least 1 successful execution. + List of LibraryAgentSummary with schemas and recent executions for sub-agent composition """ try: + # Include executions to calculate accurate status and metrics response = await library_db.list_library_agents( user_id=user_id, search_term=search_query, page=1, page_size=max_results, + include_executions=True, ) results: list[LibraryAgentSummary] = [] @@ -225,16 +239,26 @@ async def get_library_agents_for_generation( if exclude_graph_id is not None and agent.graph_id == exclude_graph_id: continue - results.append( - LibraryAgentSummary( - graph_id=agent.graph_id, - graph_version=agent.graph_version, - name=agent.name, - description=agent.description, - input_schema=agent.input_schema, - output_schema=agent.output_schema, - ) + summary = LibraryAgentSummary( + graph_id=agent.graph_id, + graph_version=agent.graph_version, + name=agent.name, + description=agent.description, + input_schema=agent.input_schema, + output_schema=agent.output_schema, ) + # Include recent executions if available + if agent.recent_executions: + exec_summaries: list[ExecutionSummary] = [] + for ex in agent.recent_executions: + exec_sum = ExecutionSummary(status=ex.status) + if ex.correctness_score is not None: + exec_sum["correctness_score"] = ex.correctness_score + if ex.activity_summary: + exec_sum["activity_summary"] = ex.activity_summary + exec_summaries.append(exec_sum) + summary["recent_executions"] = exec_summaries + results.append(summary) return results except Exception as e: logger.warning(f"Failed to fetch library agents: {e}") diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/create_agent.py b/autogpt_platform/backend/backend/api/features/chat/tools/create_agent.py index 71221e1082..17478b2d13 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/create_agent.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/create_agent.py @@ -261,7 +261,7 @@ class CreateAgentTool(BaseTool): "The generated workflow had some structural issues. " "Please try simplifying your goal or breaking it into smaller steps." ), - error_details=error_msg if error_type == "validation_error" else None, + error_details=error_msg, # Always pass error details to give users context ) return ErrorResponse( message=user_message, diff --git a/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py b/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py index 0a715ee62e..f57cd69574 100644 --- a/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py +++ b/autogpt_platform/backend/backend/api/features/chat/tools/edit_agent.py @@ -181,6 +181,7 @@ class EditAgentTool(BaseTool): operation="generate the changes", llm_parse_message="The AI had trouble generating the changes. Please try again or simplify your request.", validation_message="The generated changes failed validation. 
Please try rephrasing your request.", + error_details=error_msg, # Always pass error details to give users context ) return ErrorResponse( message=user_message, diff --git a/autogpt_platform/backend/backend/api/features/library/db.py b/autogpt_platform/backend/backend/api/features/library/db.py index d613e4ae2c..c154cfc400 100644 --- a/autogpt_platform/backend/backend/api/features/library/db.py +++ b/autogpt_platform/backend/backend/api/features/library/db.py @@ -39,6 +39,7 @@ async def list_library_agents( sort_by: library_model.LibraryAgentSort = library_model.LibraryAgentSort.UPDATED_AT, page: int = 1, page_size: int = 50, + include_executions: bool = False, ) -> library_model.LibraryAgentResponse: """ Retrieves a paginated list of LibraryAgent records for a given user. @@ -49,6 +50,9 @@ async def list_library_agents( sort_by: Sorting field (createdAt, updatedAt, isFavorite, isCreatedByUser). page: Current page (1-indexed). page_size: Number of items per page. + include_executions: Whether to include execution data for status calculation. + Defaults to False for performance (UI fetches status separately). + Set to True when accurate status/metrics are needed (e.g., agent generator). Returns: A LibraryAgentResponse containing the list of agents and pagination details. @@ -116,7 +120,7 @@ async def list_library_agents( library_agents = await prisma.models.LibraryAgent.prisma().find_many( where=where_clause, include=library_agent_include( - user_id, include_nodes=False, include_executions=False + user_id, include_nodes=False, include_executions=include_executions ), order=order_by, skip=(page - 1) * page_size, diff --git a/autogpt_platform/backend/backend/api/features/library/model.py b/autogpt_platform/backend/backend/api/features/library/model.py index 14d7c7be81..c1e41248fb 100644 --- a/autogpt_platform/backend/backend/api/features/library/model.py +++ b/autogpt_platform/backend/backend/api/features/library/model.py @@ -39,6 +39,18 @@ class MarketplaceListing(pydantic.BaseModel): creator: MarketplaceListingCreator +class RecentExecution(pydantic.BaseModel): + """Summary of a recent execution for quality assessment. + + Used by the LLM to understand the agent's recent performance with specific examples + rather than just aggregate statistics. 
+ """ + + status: str # COMPLETED, FAILED, RUNNING, QUEUED + correctness_score: float | None = None # 0-1 score if evaluated + activity_summary: str | None = None # AI-generated summary of what happened + + class LibraryAgent(pydantic.BaseModel): """ Represents an agent in the library, including metadata for display and @@ -84,6 +96,19 @@ class LibraryAgent(pydantic.BaseModel): # Indicates whether there's a new output (based on recent runs) new_output: bool + # Execution metrics (for quality assessment by LLM) + execution_count: int = 0 # Number of recent executions sampled + success_rate: float | None = ( + None # Percentage (0-100) of technically successful executions + ) + avg_correctness_score: float | None = ( + None # 0-1 score of how well executions achieved their purpose + ) + recent_executions: list[RecentExecution] = pydantic.Field( + default_factory=list, + description="List of recent executions with status, score, and summary", + ) + # Whether the user can access the underlying graph can_access_graph: bool @@ -145,6 +170,50 @@ class LibraryAgent(pydantic.BaseModel): status = status_result.status new_output = status_result.new_output + # Calculate execution metrics + execution_count = len(executions) + success_rate: float | None = None + avg_correctness_score: float | None = None + if execution_count > 0: + success_count = sum( + 1 + for e in executions + if e.executionStatus == prisma.enums.AgentExecutionStatus.COMPLETED + ) + success_rate = (success_count / execution_count) * 100 + + # Calculate average correctness score from execution stats + correctness_scores = [] + for e in executions: + if e.stats and isinstance(e.stats, dict): + score = e.stats.get("correctness_score") + if score is not None and isinstance(score, (int, float)): + correctness_scores.append(float(score)) + if correctness_scores: + avg_correctness_score = sum(correctness_scores) / len( + correctness_scores + ) + + # Build recent executions list with status, score, and summary + recent_executions: list[RecentExecution] = [] + for e in executions: + exec_score: float | None = None + exec_summary: str | None = None + if e.stats and isinstance(e.stats, dict): + score = e.stats.get("correctness_score") + if score is not None and isinstance(score, (int, float)): + exec_score = float(score) + summary = e.stats.get("activity_status") + if summary is not None and isinstance(summary, str): + exec_summary = summary + recent_executions.append( + RecentExecution( + status=e.executionStatus.value, + correctness_score=exec_score, + activity_summary=exec_summary, + ) + ) + # Check if user can access the graph can_access_graph = agent.AgentGraph.userId == agent.userId @@ -190,6 +259,10 @@ class LibraryAgent(pydantic.BaseModel): has_sensitive_action=graph.has_sensitive_action, trigger_setup_info=graph.trigger_setup_info, new_output=new_output, + execution_count=execution_count, + success_rate=success_rate, + avg_correctness_score=avg_correctness_score, + recent_executions=recent_executions, can_access_graph=can_access_graph, is_latest_version=is_latest_version, is_favorite=agent.isFavorite,