feat: add recent_executions and improve error messages for agent generation

- Add RecentExecution model with status, correctness_score, and activity_summary
- Expose recent_executions in LibraryAgent for quality assessment
- Always pass error_details to user-facing messages for better debugging
- Add ExecutionSummary TypedDict for per-execution summaries in sub-agent composition
Author: Zamil Majdy
Date: 2026-01-30 09:15:58 -06:00
parent 1ad8fde75d
commit de57c99286
5 changed files with 124 additions and 22 deletions

View File

@@ -3,7 +3,7 @@
 import logging
 import re
 import uuid
-from typing import Any, TypedDict
+from typing import Any, NotRequired, TypedDict
 
 from backend.api.features.library import db as library_db
 from backend.api.features.store import db as store_db
@@ -27,15 +27,30 @@ from .service import (
 logger = logging.getLogger(__name__)
 
 
-class LibraryAgentSummary(TypedDict):
-    """Summary of a library agent for sub-agent composition."""
+class ExecutionSummary(TypedDict):
+    """Summary of a single execution for quality assessment."""
+
+    status: str  # COMPLETED, FAILED, RUNNING, QUEUED
+    correctness_score: NotRequired[float]  # 0-1 score if evaluated
+    activity_summary: NotRequired[str]  # AI-generated summary of what happened
+
+
+class LibraryAgentSummary(TypedDict):
+    """Summary of a library agent for sub-agent composition.
+
+    Includes recent executions to help the LLM decide whether to use this agent.
+    Each execution shows status, correctness_score (0-1), and activity_summary.
+    """
 
+    # Required fields
     graph_id: str
     graph_version: int
     name: str
     description: str
     input_schema: dict[str, Any]
     output_schema: dict[str, Any]
+    # Recent executions with detailed status and quality info
+    recent_executions: NotRequired[list[ExecutionSummary]]
 
 
 class MarketplaceAgentSummary(TypedDict):
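Because the two optional keys are NotRequired, an ExecutionSummary simply omits them when no evaluation data exists, which keeps the JSON handed to the LLM compact. A minimal, self-contained sketch of the resulting shapes (sample values are purely illustrative):

import json
from typing import NotRequired, TypedDict


class ExecutionSummary(TypedDict):
    status: str
    correctness_score: NotRequired[float]
    activity_summary: NotRequired[str]


# An evaluated run carries all three keys; a queued run has only "status".
evaluated: ExecutionSummary = {
    "status": "COMPLETED",
    "correctness_score": 0.92,
    "activity_summary": "Fetched 10 articles and produced a digest",
}
queued: ExecutionSummary = {"status": "QUEUED"}

print(json.dumps([evaluated, queued], indent=2))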
@@ -196,6 +211,10 @@ async def get_library_agents_for_generation(
     Uses search-based fetching to return relevant agents instead of all agents.
     This is more scalable for users with large libraries.
 
+    Includes recent_executions list to help the LLM assess agent quality:
+    - Each execution has status, correctness_score (0-1), and activity_summary
+    - This gives the LLM concrete examples of recent performance
+
     Args:
         user_id: The user ID
         search_query: Optional search term to find relevant agents (user's goal/description)
@@ -203,21 +222,16 @@ async def get_library_agents_for_generation(
         max_results: Maximum number of agents to return (default 15)
 
     Returns:
-        List of LibraryAgentSummary with schemas for sub-agent composition
-
-    Note:
-        Future enhancement: Add quality filtering based on execution success rate
-        or correctness_score from AgentGraphExecution stats. The current
-        LibraryAgentStatus.ERROR is too aggressive (1 failed run = ERROR).
-        Better approach: filter by success rate (e.g., >50% successful runs)
-        or require at least 1 successful execution.
+        List of LibraryAgentSummary with schemas and recent executions for sub-agent composition
     """
     try:
+        # Include executions to calculate accurate status and metrics
         response = await library_db.list_library_agents(
             user_id=user_id,
             search_term=search_query,
             page=1,
             page_size=max_results,
+            include_executions=True,
         )
 
         results: list[LibraryAgentSummary] = []
@@ -225,16 +239,26 @@ async def get_library_agents_for_generation(
             if exclude_graph_id is not None and agent.graph_id == exclude_graph_id:
                 continue
 
-            results.append(
-                LibraryAgentSummary(
-                    graph_id=agent.graph_id,
-                    graph_version=agent.graph_version,
-                    name=agent.name,
-                    description=agent.description,
-                    input_schema=agent.input_schema,
-                    output_schema=agent.output_schema,
-                )
+            summary = LibraryAgentSummary(
+                graph_id=agent.graph_id,
+                graph_version=agent.graph_version,
+                name=agent.name,
+                description=agent.description,
+                input_schema=agent.input_schema,
+                output_schema=agent.output_schema,
             )
+            # Include recent executions if available
+            if agent.recent_executions:
+                exec_summaries: list[ExecutionSummary] = []
+                for ex in agent.recent_executions:
+                    exec_sum = ExecutionSummary(status=ex.status)
+                    if ex.correctness_score is not None:
+                        exec_sum["correctness_score"] = ex.correctness_score
+                    if ex.activity_summary:
+                        exec_sum["activity_summary"] = ex.activity_summary
+                    exec_summaries.append(exec_sum)
+                summary["recent_executions"] = exec_summaries
+            results.append(summary)
 
         return results
     except Exception as e:
         logger.warning(f"Failed to fetch library agents: {e}")
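With recent_executions attached, a downstream prompt builder can present concrete evidence of agent quality to the LLM. The helper below is hypothetical (this commit adds no such function); it only illustrates how the new fields might be rendered into prompt text:

def render_agent_quality(summary: dict) -> str:
    # Hypothetical prompt-fragment builder; key names match LibraryAgentSummary.
    lines = [f"{summary['name']}: {summary['description']}"]
    for ex in summary.get("recent_executions", []):
        parts = [ex["status"]]
        if "correctness_score" in ex:
            parts.append(f"correctness {ex['correctness_score']:.2f}")
        if "activity_summary" in ex:
            parts.append(ex["activity_summary"])
        lines.append("  - " + ", ".join(parts))
    return "\n".join(lines)


print(render_agent_quality({
    "name": "News Digest",
    "description": "Summarizes daily headlines",
    "recent_executions": [
        {"status": "COMPLETED", "correctness_score": 0.9,
         "activity_summary": "Produced a 5-item digest"},
        {"status": "FAILED"},
    ],
}))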

View File

@@ -261,7 +261,7 @@ class CreateAgentTool(BaseTool):
                 "The generated workflow had some structural issues. "
                 "Please try simplifying your goal or breaking it into smaller steps."
             ),
-            error_details=error_msg if error_type == "validation_error" else None,
+            error_details=error_msg,  # Always pass error details to give users context
         )
         return ErrorResponse(
             message=user_message,

View File

@@ -181,6 +181,7 @@ class EditAgentTool(BaseTool):
             operation="generate the changes",
             llm_parse_message="The AI had trouble generating the changes. Please try again or simplify your request.",
             validation_message="The generated changes failed validation. Please try rephrasing your request.",
+            error_details=error_msg,  # Always pass error details to give users context
         )
         return ErrorResponse(
             message=user_message,
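Both tool diffs make the same change: error_msg now reaches error_details unconditionally instead of only for validation errors. A rough sketch of the effect, assuming the message builder appends details along these lines (build_user_message is a stand-in, not the actual helper in this codebase, and the detail string is invented):

def build_user_message(base_message: str, error_details: str | None = None) -> str:
    # Stand-in for the real error formatter: append raw details when present.
    if error_details:
        return f"{base_message}\n\nDetails: {error_details}"
    return base_message


# Before this commit, details were dropped unless error_type == "validation_error";
# now the underlying error always reaches the user for debugging.
print(build_user_message(
    "The generated changes failed validation. Please try rephrasing your request.",
    error_details="Output 'result' does not exist on block 'HttpRequestBlock'",
))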

View File

@@ -39,6 +39,7 @@ async def list_library_agents(
     sort_by: library_model.LibraryAgentSort = library_model.LibraryAgentSort.UPDATED_AT,
     page: int = 1,
     page_size: int = 50,
+    include_executions: bool = False,
 ) -> library_model.LibraryAgentResponse:
     """
     Retrieves a paginated list of LibraryAgent records for a given user.
@@ -49,6 +50,9 @@
         sort_by: Sorting field (createdAt, updatedAt, isFavorite, isCreatedByUser).
         page: Current page (1-indexed).
         page_size: Number of items per page.
+        include_executions: Whether to include execution data for status calculation.
+            Defaults to False for performance (UI fetches status separately).
+            Set to True when accurate status/metrics are needed (e.g., agent generator).
 
     Returns:
         A LibraryAgentResponse containing the list of agents and pagination details.
@@ -116,7 +120,7 @@ async def list_library_agents(
     library_agents = await prisma.models.LibraryAgent.prisma().find_many(
         where=where_clause,
         include=library_agent_include(
-            user_id, include_nodes=False, include_executions=False
+            user_id, include_nodes=False, include_executions=include_executions
         ),
         order=order_by,
         skip=(page - 1) * page_size,
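A call-site sketch for the new flag; the wrapper function and search term are illustrative, while the keyword arguments mirror the signature above:

from backend.api.features.library import db as library_db


async def fetch_agents_with_metrics(user_id: str):
    # include_executions=True joins execution rows so status and quality metrics
    # are accurate; the False default keeps the common UI listing path cheap.
    return await library_db.list_library_agents(
        user_id=user_id,
        search_term="price tracking",
        page=1,
        page_size=15,
        include_executions=True,
    )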

View File

@@ -39,6 +39,18 @@ class MarketplaceListing(pydantic.BaseModel):
     creator: MarketplaceListingCreator
 
 
+class RecentExecution(pydantic.BaseModel):
+    """Summary of a recent execution for quality assessment.
+
+    Used by the LLM to understand the agent's recent performance with specific examples
+    rather than just aggregate statistics.
+    """
+
+    status: str  # COMPLETED, FAILED, RUNNING, QUEUED
+    correctness_score: float | None = None  # 0-1 score if evaluated
+    activity_summary: str | None = None  # AI-generated summary of what happened
+
+
 class LibraryAgent(pydantic.BaseModel):
     """
     Represents an agent in the library, including metadata for display and
"""
Represents an agent in the library, including metadata for display and
@@ -84,6 +96,19 @@ class LibraryAgent(pydantic.BaseModel):
 
     # Indicates whether there's a new output (based on recent runs)
     new_output: bool
 
+    # Execution metrics (for quality assessment by LLM)
+    execution_count: int = 0  # Number of recent executions sampled
+    success_rate: float | None = (
+        None  # Percentage (0-100) of technically successful executions
+    )
+    avg_correctness_score: float | None = (
+        None  # 0-1 score of how well executions achieved their purpose
+    )
+    recent_executions: list[RecentExecution] = pydantic.Field(
+        default_factory=list,
+        description="List of recent executions with status, score, and summary",
+    )
+
     # Whether the user can access the underlying graph
     can_access_graph: bool
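All four new fields default to empty values, so existing code that constructs a LibraryAgent without them should continue to validate. A trimmed, standalone illustration of those defaults (AgentMetrics is a stand-in model, not part of the commit):

import pydantic


class RecentExecution(pydantic.BaseModel):
    status: str
    correctness_score: float | None = None
    activity_summary: str | None = None


class AgentMetrics(pydantic.BaseModel):
    # Stand-in carrying only the fields this commit adds to LibraryAgent.
    execution_count: int = 0
    success_rate: float | None = None
    avg_correctness_score: float | None = None
    recent_executions: list[RecentExecution] = pydantic.Field(default_factory=list)


print(AgentMetrics().model_dump())
# -> {'execution_count': 0, 'success_rate': None,
#     'avg_correctness_score': None, 'recent_executions': []}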
@@ -145,6 +170,50 @@ class LibraryAgent(pydantic.BaseModel):
         status = status_result.status
         new_output = status_result.new_output
 
+        # Calculate execution metrics
+        execution_count = len(executions)
+        success_rate: float | None = None
+        avg_correctness_score: float | None = None
+
+        if execution_count > 0:
+            success_count = sum(
+                1
+                for e in executions
+                if e.executionStatus == prisma.enums.AgentExecutionStatus.COMPLETED
+            )
+            success_rate = (success_count / execution_count) * 100
+
+            # Calculate average correctness score from execution stats
+            correctness_scores = []
+            for e in executions:
+                if e.stats and isinstance(e.stats, dict):
+                    score = e.stats.get("correctness_score")
+                    if score is not None and isinstance(score, (int, float)):
+                        correctness_scores.append(float(score))
+            if correctness_scores:
+                avg_correctness_score = sum(correctness_scores) / len(
+                    correctness_scores
+                )
+
+        # Build recent executions list with status, score, and summary
+        recent_executions: list[RecentExecution] = []
+        for e in executions:
+            exec_score: float | None = None
+            exec_summary: str | None = None
+            if e.stats and isinstance(e.stats, dict):
+                score = e.stats.get("correctness_score")
+                if score is not None and isinstance(score, (int, float)):
+                    exec_score = float(score)
+                summary = e.stats.get("activity_status")
+                if summary is not None and isinstance(summary, str):
+                    exec_summary = summary
+            recent_executions.append(
+                RecentExecution(
+                    status=e.executionStatus.value,
+                    correctness_score=exec_score,
+                    activity_summary=exec_summary,
+                )
+            )
+
         # Check if user can access the graph
         can_access_graph = agent.AgentGraph.userId == agent.userId
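A quick numeric check of the two formulas with three sampled executions, two COMPLETED and one FAILED, where only the completed runs carry a correctness score (sample data; the filtering mirrors the stats-dict logic above):

executions = [
    {"status": "COMPLETED", "stats": {"correctness_score": 0.9}},
    {"status": "COMPLETED", "stats": {"correctness_score": 0.7}},
    {"status": "FAILED", "stats": {}},
]

execution_count = len(executions)  # 3
success_count = sum(1 for e in executions if e["status"] == "COMPLETED")  # 2
success_rate = (success_count / execution_count) * 100  # 66.67 on the 0-100 scale

correctness_scores = []
for e in executions:
    score = e["stats"].get("correctness_score")
    if isinstance(score, (int, float)):
        correctness_scores.append(float(score))
avg_correctness_score = (
    sum(correctness_scores) / len(correctness_scores) if correctness_scores else None
)  # (0.9 + 0.7) / 2 = 0.8 on the 0-1 scale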
@@ -190,6 +259,10 @@ class LibraryAgent(pydantic.BaseModel):
             has_sensitive_action=graph.has_sensitive_action,
             trigger_setup_info=graph.trigger_setup_info,
             new_output=new_output,
+            execution_count=execution_count,
+            success_rate=success_rate,
+            avg_correctness_score=avg_correctness_score,
+            recent_executions=recent_executions,
             can_access_graph=can_access_graph,
             is_latest_version=is_latest_version,
             is_favorite=agent.isFavorite,