Mirror of https://github.com/Significant-Gravitas/AutoGPT.git (synced 2026-01-30 17:38:17 -05:00)
feat: add recent_executions and improve error messages for agent generation
- Add RecentExecution model with status, correctness_score, and activity_summary
- Expose recent_executions in LibraryAgent for quality assessment
- Always pass error_details to user-facing messages for better debugging
- Update ExecutionSummary TypedDict for sub-agent composition
@@ -3,7 +3,7 @@
 import logging
 import re
 import uuid
-from typing import Any, TypedDict
+from typing import Any, NotRequired, TypedDict
 
 from backend.api.features.library import db as library_db
 from backend.api.features.store import db as store_db
@@ -27,15 +27,30 @@ from .service import (
 logger = logging.getLogger(__name__)
 
 
-class LibraryAgentSummary(TypedDict):
-    """Summary of a library agent for sub-agent composition."""
+class ExecutionSummary(TypedDict):
+    """Summary of a single execution for quality assessment."""
+
+    status: str  # COMPLETED, FAILED, RUNNING, QUEUED
+    correctness_score: NotRequired[float]  # 0-1 score if evaluated
+    activity_summary: NotRequired[str]  # AI-generated summary of what happened
+
+
+class LibraryAgentSummary(TypedDict):
+    """Summary of a library agent for sub-agent composition.
+
+    Includes recent executions to help the LLM decide whether to use this agent.
+    Each execution shows status, correctness_score (0-1), and activity_summary.
+    """
 
+    # Required fields
     graph_id: str
     graph_version: int
     name: str
     description: str
     input_schema: dict[str, Any]
     output_schema: dict[str, Any]
+    # Recent executions with detailed status and quality info
+    recent_executions: NotRequired[list[ExecutionSummary]]
 
 
 class MarketplaceAgentSummary(TypedDict):
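For illustration, a summary built from these TypedDicts might look like the sketch below (all values invented, not taken from the codebase). Because recent_executions, correctness_score, and activity_summary are NotRequired, absent keys are omitted entirely rather than set to None:

# Hypothetical LibraryAgentSummary as the LLM would receive it.
example_summary = {
    "graph_id": "graph-123",  # invented id
    "graph_version": 2,
    "name": "Weekly Report Generator",  # invented name
    "description": "Summarizes metrics into a weekly report.",
    "input_schema": {"type": "object", "properties": {}},
    "output_schema": {"type": "object", "properties": {}},
    "recent_executions": [
        {
            "status": "COMPLETED",
            "correctness_score": 0.9,
            "activity_summary": "Generated the report successfully.",
        },
        {"status": "FAILED"},  # score/summary omitted when not evaluated
    ],
}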
@@ -196,6 +211,10 @@ async def get_library_agents_for_generation(
     Uses search-based fetching to return relevant agents instead of all agents.
     This is more scalable for users with large libraries.
 
+    Includes recent_executions list to help the LLM assess agent quality:
+    - Each execution has status, correctness_score (0-1), and activity_summary
+    - This gives the LLM concrete examples of recent performance
+
     Args:
         user_id: The user ID
         search_query: Optional search term to find relevant agents (user's goal/description)
@@ -203,21 +222,16 @@ async def get_library_agents_for_generation(
         max_results: Maximum number of agents to return (default 15)
 
     Returns:
-        List of LibraryAgentSummary with schemas for sub-agent composition
-
-    Note:
-        Future enhancement: Add quality filtering based on execution success rate
-        or correctness_score from AgentGraphExecution stats. The current
-        LibraryAgentStatus.ERROR is too aggressive (1 failed run = ERROR).
-        Better approach: filter by success rate (e.g., >50% successful runs)
-        or require at least 1 successful execution.
+        List of LibraryAgentSummary with schemas and recent executions for sub-agent composition
     """
     try:
+        # Include executions to calculate accurate status and metrics
         response = await library_db.list_library_agents(
             user_id=user_id,
             search_term=search_query,
             page=1,
             page_size=max_results,
+            include_executions=True,
         )
 
         results: list[LibraryAgentSummary] = []
@@ -225,16 +239,26 @@ async def get_library_agents_for_generation(
             if exclude_graph_id is not None and agent.graph_id == exclude_graph_id:
                 continue
 
-            results.append(
-                LibraryAgentSummary(
-                    graph_id=agent.graph_id,
-                    graph_version=agent.graph_version,
-                    name=agent.name,
-                    description=agent.description,
-                    input_schema=agent.input_schema,
-                    output_schema=agent.output_schema,
-                )
-            )
+            summary = LibraryAgentSummary(
+                graph_id=agent.graph_id,
+                graph_version=agent.graph_version,
+                name=agent.name,
+                description=agent.description,
+                input_schema=agent.input_schema,
+                output_schema=agent.output_schema,
+            )
+            # Include recent executions if available
+            if agent.recent_executions:
+                exec_summaries: list[ExecutionSummary] = []
+                for ex in agent.recent_executions:
+                    exec_sum = ExecutionSummary(status=ex.status)
+                    if ex.correctness_score is not None:
+                        exec_sum["correctness_score"] = ex.correctness_score
+                    if ex.activity_summary:
+                        exec_sum["activity_summary"] = ex.activity_summary
+                    exec_summaries.append(exec_sum)
+                summary["recent_executions"] = exec_summaries
+            results.append(summary)
         return results
     except Exception as e:
         logger.warning(f"Failed to fetch library agents: {e}")
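A hedged usage sketch of the function patched above. The surrounding module and import path are assumptions not shown in this diff; the IDs and search query are invented. The function is async, so it must be awaited:

import asyncio

# from <agent generator module> import get_library_agents_for_generation
# (import path assumed, not confirmed by this commit)

async def main() -> None:
    agents = await get_library_agents_for_generation(
        user_id="user-123",  # invented id
        search_query="summarize weekly metrics",  # the user's goal
        max_results=15,
    )
    for a in agents:
        # recent_executions is NotRequired, so read it with .get()
        for run in a.get("recent_executions", []):
            print(a["name"], run["status"], run.get("correctness_score"))

asyncio.run(main())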
@@ -261,7 +261,7 @@ class CreateAgentTool(BaseTool):
                 "The generated workflow had some structural issues. "
                 "Please try simplifying your goal or breaking it into smaller steps."
             ),
-            error_details=error_msg if error_type == "validation_error" else None,
+            error_details=error_msg,  # Always pass error details to give users context
         )
         return ErrorResponse(
             message=user_message,
@@ -181,6 +181,7 @@ class EditAgentTool(BaseTool):
             operation="generate the changes",
             llm_parse_message="The AI had trouble generating the changes. Please try again or simplify your request.",
             validation_message="The generated changes failed validation. Please try rephrasing your request.",
+            error_details=error_msg,  # Always pass error details to give users context
         )
         return ErrorResponse(
             message=user_message,
@@ -39,6 +39,7 @@ async def list_library_agents(
     sort_by: library_model.LibraryAgentSort = library_model.LibraryAgentSort.UPDATED_AT,
     page: int = 1,
     page_size: int = 50,
+    include_executions: bool = False,
 ) -> library_model.LibraryAgentResponse:
     """
     Retrieves a paginated list of LibraryAgent records for a given user.
@@ -49,6 +50,9 @@ async def list_library_agents(
         sort_by: Sorting field (createdAt, updatedAt, isFavorite, isCreatedByUser).
         page: Current page (1-indexed).
         page_size: Number of items per page.
+        include_executions: Whether to include execution data for status calculation.
+            Defaults to False for performance (UI fetches status separately).
+            Set to True when accurate status/metrics are needed (e.g., agent generator).
 
     Returns:
         A LibraryAgentResponse containing the list of agents and pagination details.
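A sketch of how a caller might opt in to execution data. The library_db alias matches the import shown earlier in this diff; the search term and function name are invented, and the call must run inside a coroutine:

async def fetch_with_metrics(user_id: str):
    # Opt in to execution data; include_executions defaults to False
    # so ordinary UI listings stay fast.
    return await library_db.list_library_agents(
        user_id=user_id,
        search_term="report",  # invented search term
        page=1,
        page_size=15,
        include_executions=True,
    )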
@@ -116,7 +120,7 @@ async def list_library_agents(
     library_agents = await prisma.models.LibraryAgent.prisma().find_many(
         where=where_clause,
         include=library_agent_include(
-            user_id, include_nodes=False, include_executions=False
+            user_id, include_nodes=False, include_executions=include_executions
         ),
         order=order_by,
         skip=(page - 1) * page_size,
@@ -39,6 +39,18 @@ class MarketplaceListing(pydantic.BaseModel):
     creator: MarketplaceListingCreator
 
 
+class RecentExecution(pydantic.BaseModel):
+    """Summary of a recent execution for quality assessment.
+
+    Used by the LLM to understand the agent's recent performance with specific examples
+    rather than just aggregate statistics.
+    """
+
+    status: str  # COMPLETED, FAILED, RUNNING, QUEUED
+    correctness_score: float | None = None  # 0-1 score if evaluated
+    activity_summary: str | None = None  # AI-generated summary of what happened
+
+
 class LibraryAgent(pydantic.BaseModel):
     """
     Represents an agent in the library, including metadata for display and
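A self-contained sketch of how this model behaves, re-declared locally so it runs standalone (assumes Pydantic v2 for model_dump; the values are invented):

import pydantic

class RecentExecution(pydantic.BaseModel):
    """Local re-declaration of the model above, for illustration only."""
    status: str
    correctness_score: float | None = None
    activity_summary: str | None = None

run = RecentExecution(status="COMPLETED", correctness_score=0.85)
print(run.model_dump())
# -> {'status': 'COMPLETED', 'correctness_score': 0.85, 'activity_summary': None}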
@@ -84,6 +96,19 @@ class LibraryAgent(pydantic.BaseModel):
     # Indicates whether there's a new output (based on recent runs)
     new_output: bool
 
+    # Execution metrics (for quality assessment by LLM)
+    execution_count: int = 0  # Number of recent executions sampled
+    success_rate: float | None = (
+        None  # Percentage (0-100) of technically successful executions
+    )
+    avg_correctness_score: float | None = (
+        None  # 0-1 score of how well executions achieved their purpose
+    )
+    recent_executions: list[RecentExecution] = pydantic.Field(
+        default_factory=list,
+        description="List of recent executions with status, score, and summary",
+    )
+
     # Whether the user can access the underlying graph
     can_access_graph: bool
 
@@ -145,6 +170,50 @@ class LibraryAgent(pydantic.BaseModel):
         status = status_result.status
         new_output = status_result.new_output
 
+        # Calculate execution metrics
+        execution_count = len(executions)
+        success_rate: float | None = None
+        avg_correctness_score: float | None = None
+        if execution_count > 0:
+            success_count = sum(
+                1
+                for e in executions
+                if e.executionStatus == prisma.enums.AgentExecutionStatus.COMPLETED
+            )
+            success_rate = (success_count / execution_count) * 100
+
+            # Calculate average correctness score from execution stats
+            correctness_scores = []
+            for e in executions:
+                if e.stats and isinstance(e.stats, dict):
+                    score = e.stats.get("correctness_score")
+                    if score is not None and isinstance(score, (int, float)):
+                        correctness_scores.append(float(score))
+            if correctness_scores:
+                avg_correctness_score = sum(correctness_scores) / len(
+                    correctness_scores
+                )
+
+        # Build recent executions list with status, score, and summary
+        recent_executions: list[RecentExecution] = []
+        for e in executions:
+            exec_score: float | None = None
+            exec_summary: str | None = None
+            if e.stats and isinstance(e.stats, dict):
+                score = e.stats.get("correctness_score")
+                if score is not None and isinstance(score, (int, float)):
+                    exec_score = float(score)
+                summary = e.stats.get("activity_status")
+                if summary is not None and isinstance(summary, str):
+                    exec_summary = summary
+            recent_executions.append(
+                RecentExecution(
+                    status=e.executionStatus.value,
+                    correctness_score=exec_score,
+                    activity_summary=exec_summary,
+                )
+            )
+
         # Check if user can access the graph
         can_access_graph = agent.AgentGraph.userId == agent.userId
 
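A quick worked example of the metric math above, with invented numbers:

# Five sampled executions, three COMPLETED -> success_rate of 60.0;
# two evaluated runs with scores 0.8 and 0.9 -> avg_correctness_score 0.85.
statuses = ["COMPLETED", "FAILED", "COMPLETED", "COMPLETED", "QUEUED"]
scores = [0.8, 0.9]  # only executions whose stats carried a correctness_score

execution_count = len(statuses)
success_rate = (statuses.count("COMPLETED") / execution_count) * 100  # 60.0
avg_correctness_score = sum(scores) / len(scores)  # 0.85
print(execution_count, success_rate, avg_correctness_score)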
@@ -190,6 +259,10 @@ class LibraryAgent(pydantic.BaseModel):
             has_sensitive_action=graph.has_sensitive_action,
             trigger_setup_info=graph.trigger_setup_info,
             new_output=new_output,
+            execution_count=execution_count,
+            success_rate=success_rate,
+            avg_correctness_score=avg_correctness_score,
+            recent_executions=recent_executions,
             can_access_graph=can_access_graph,
             is_latest_version=is_latest_version,
             is_favorite=agent.isFavorite,