feat: add recent_executions and improve error messages for agent generation

- Add RecentExecution model with status, correctness_score, and activity_summary
- Expose recent_executions in LibraryAgent for quality assessment
- Always pass error_details to user-facing messages for better debugging
- Add ExecutionSummary TypedDict for per-execution summaries in sub-agent composition
Author: Zamil Majdy
Date: 2026-01-30 09:15:58 -06:00
parent 1ad8fde75d
commit de57c99286
5 changed files with 124 additions and 22 deletions

View File

@@ -3,7 +3,7 @@
 import logging
 import re
 import uuid
-from typing import Any, TypedDict
+from typing import Any, NotRequired, TypedDict
 
 from backend.api.features.library import db as library_db
 from backend.api.features.store import db as store_db
@@ -27,15 +27,30 @@ from .service import (
 logger = logging.getLogger(__name__)
 
 
-class LibraryAgentSummary(TypedDict):
-    """Summary of a library agent for sub-agent composition."""
+class ExecutionSummary(TypedDict):
+    """Summary of a single execution for quality assessment."""
+
+    status: str  # COMPLETED, FAILED, RUNNING, QUEUED
+    correctness_score: NotRequired[float]  # 0-1 score if evaluated
+    activity_summary: NotRequired[str]  # AI-generated summary of what happened
+
+
+class LibraryAgentSummary(TypedDict):
+    """Summary of a library agent for sub-agent composition.
+
+    Includes recent executions to help the LLM decide whether to use this agent.
+    Each execution shows status, correctness_score (0-1), and activity_summary.
+    """
 
+    # Required fields
     graph_id: str
     graph_version: int
     name: str
     description: str
     input_schema: dict[str, Any]
     output_schema: dict[str, Any]
+    # Recent executions with detailed status and quality info
+    recent_executions: NotRequired[list[ExecutionSummary]]
 
 
 class MarketplaceAgentSummary(TypedDict):
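Because the two optional keys are NotRequired, an ExecutionSummary simply omits them when no evaluation data exists, which keeps the JSON handed to the LLM compact. A minimal, self-contained sketch of the resulting shapes (sample values are purely illustrative):

import json
from typing import NotRequired, TypedDict


class ExecutionSummary(TypedDict):
    status: str
    correctness_score: NotRequired[float]
    activity_summary: NotRequired[str]


# An evaluated run carries all three keys; a queued run has only "status".
evaluated: ExecutionSummary = {
    "status": "COMPLETED",
    "correctness_score": 0.92,
    "activity_summary": "Fetched 10 articles and produced a digest",
}
queued: ExecutionSummary = {"status": "QUEUED"}

print(json.dumps([evaluated, queued], indent=2))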
@@ -196,6 +211,10 @@ async def get_library_agents_for_generation(
     Uses search-based fetching to return relevant agents instead of all agents.
     This is more scalable for users with large libraries.
 
+    Includes recent_executions list to help the LLM assess agent quality:
+    - Each execution has status, correctness_score (0-1), and activity_summary
+    - This gives the LLM concrete examples of recent performance
+
     Args:
         user_id: The user ID
         search_query: Optional search term to find relevant agents (user's goal/description)
@@ -203,21 +222,16 @@ async def get_library_agents_for_generation(
         max_results: Maximum number of agents to return (default 15)
 
     Returns:
-        List of LibraryAgentSummary with schemas for sub-agent composition
-
-    Note:
-        Future enhancement: Add quality filtering based on execution success rate
-        or correctness_score from AgentGraphExecution stats. The current
-        LibraryAgentStatus.ERROR is too aggressive (1 failed run = ERROR).
-        Better approach: filter by success rate (e.g., >50% successful runs)
-        or require at least 1 successful execution.
+        List of LibraryAgentSummary with schemas and recent executions for sub-agent composition
     """
     try:
+        # Include executions to calculate accurate status and metrics
         response = await library_db.list_library_agents(
             user_id=user_id,
             search_term=search_query,
             page=1,
             page_size=max_results,
+            include_executions=True,
         )
 
         results: list[LibraryAgentSummary] = []
@@ -225,16 +239,26 @@ async def get_library_agents_for_generation(
             if exclude_graph_id is not None and agent.graph_id == exclude_graph_id:
                 continue
 
-            results.append(
-                LibraryAgentSummary(
-                    graph_id=agent.graph_id,
-                    graph_version=agent.graph_version,
-                    name=agent.name,
-                    description=agent.description,
-                    input_schema=agent.input_schema,
-                    output_schema=agent.output_schema,
-                )
+            summary = LibraryAgentSummary(
+                graph_id=agent.graph_id,
+                graph_version=agent.graph_version,
+                name=agent.name,
+                description=agent.description,
+                input_schema=agent.input_schema,
+                output_schema=agent.output_schema,
             )
+            # Include recent executions if available
+            if agent.recent_executions:
+                exec_summaries: list[ExecutionSummary] = []
+                for ex in agent.recent_executions:
+                    exec_sum = ExecutionSummary(status=ex.status)
+                    if ex.correctness_score is not None:
+                        exec_sum["correctness_score"] = ex.correctness_score
+                    if ex.activity_summary:
+                        exec_sum["activity_summary"] = ex.activity_summary
+                    exec_summaries.append(exec_sum)
+                summary["recent_executions"] = exec_summaries
+            results.append(summary)
 
         return results
     except Exception as e:
         logger.warning(f"Failed to fetch library agents: {e}")
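With recent_executions attached, a downstream prompt builder can present concrete evidence of agent quality to the LLM. The helper below is hypothetical (this commit adds no such function); it only illustrates how the new fields might be rendered into prompt text:

def render_agent_quality(summary: dict) -> str:
    # Hypothetical prompt-fragment builder; key names match LibraryAgentSummary.
    lines = [f"{summary['name']}: {summary['description']}"]
    for ex in summary.get("recent_executions", []):
        parts = [ex["status"]]
        if "correctness_score" in ex:
            parts.append(f"correctness {ex['correctness_score']:.2f}")
        if "activity_summary" in ex:
            parts.append(ex["activity_summary"])
        lines.append("  - " + ", ".join(parts))
    return "\n".join(lines)


print(render_agent_quality({
    "name": "News Digest",
    "description": "Summarizes daily headlines",
    "recent_executions": [
        {"status": "COMPLETED", "correctness_score": 0.9,
         "activity_summary": "Produced a 5-item digest"},
        {"status": "FAILED"},
    ],
}))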

View File

@@ -261,7 +261,7 @@ class CreateAgentTool(BaseTool):
                 "The generated workflow had some structural issues. "
                 "Please try simplifying your goal or breaking it into smaller steps."
             ),
-            error_details=error_msg if error_type == "validation_error" else None,
+            error_details=error_msg,  # Always pass error details to give users context
         )
         return ErrorResponse(
             message=user_message,

View File

@@ -181,6 +181,7 @@ class EditAgentTool(BaseTool):
             operation="generate the changes",
             llm_parse_message="The AI had trouble generating the changes. Please try again or simplify your request.",
             validation_message="The generated changes failed validation. Please try rephrasing your request.",
+            error_details=error_msg,  # Always pass error details to give users context
         )
         return ErrorResponse(
             message=user_message,
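Both tool diffs make the same change: error_msg now reaches error_details unconditionally instead of only for validation errors. A rough sketch of the effect, assuming the message builder appends details along these lines (build_user_message is a stand-in, not the actual helper in this codebase, and the detail string is invented):

def build_user_message(base_message: str, error_details: str | None = None) -> str:
    # Stand-in for the real error formatter: append raw details when present.
    if error_details:
        return f"{base_message}\n\nDetails: {error_details}"
    return base_message


# Before this commit, details were dropped unless error_type == "validation_error";
# now the underlying error always reaches the user for debugging.
print(build_user_message(
    "The generated changes failed validation. Please try rephrasing your request.",
    error_details="Output 'result' does not exist on block 'HttpRequestBlock'",
))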

View File

@@ -39,6 +39,7 @@ async def list_library_agents(
     sort_by: library_model.LibraryAgentSort = library_model.LibraryAgentSort.UPDATED_AT,
     page: int = 1,
     page_size: int = 50,
+    include_executions: bool = False,
 ) -> library_model.LibraryAgentResponse:
     """
     Retrieves a paginated list of LibraryAgent records for a given user.
@@ -49,6 +50,9 @@
         sort_by: Sorting field (createdAt, updatedAt, isFavorite, isCreatedByUser).
         page: Current page (1-indexed).
         page_size: Number of items per page.
+        include_executions: Whether to include execution data for status calculation.
+            Defaults to False for performance (UI fetches status separately).
+            Set to True when accurate status/metrics are needed (e.g., agent generator).
 
     Returns:
         A LibraryAgentResponse containing the list of agents and pagination details.
@@ -116,7 +120,7 @@ async def list_library_agents(
     library_agents = await prisma.models.LibraryAgent.prisma().find_many(
         where=where_clause,
         include=library_agent_include(
-            user_id, include_nodes=False, include_executions=False
+            user_id, include_nodes=False, include_executions=include_executions
         ),
         order=order_by,
         skip=(page - 1) * page_size,
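A call-site sketch for the new flag; the wrapper function and search term are illustrative, while the keyword arguments mirror the signature above:

from backend.api.features.library import db as library_db


async def fetch_agents_with_metrics(user_id: str):
    # include_executions=True joins execution rows so status and quality metrics
    # are accurate; the False default keeps the common UI listing path cheap.
    return await library_db.list_library_agents(
        user_id=user_id,
        search_term="price tracking",
        page=1,
        page_size=15,
        include_executions=True,
    )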

View File

@@ -39,6 +39,18 @@ class MarketplaceListing(pydantic.BaseModel):
     creator: MarketplaceListingCreator
 
 
+class RecentExecution(pydantic.BaseModel):
+    """Summary of a recent execution for quality assessment.
+
+    Used by the LLM to understand the agent's recent performance with specific examples
+    rather than just aggregate statistics.
+    """
+
+    status: str  # COMPLETED, FAILED, RUNNING, QUEUED
+    correctness_score: float | None = None  # 0-1 score if evaluated
+    activity_summary: str | None = None  # AI-generated summary of what happened
+
+
 class LibraryAgent(pydantic.BaseModel):
     """
     Represents an agent in the library, including metadata for display and
"""
Represents an agent in the library, including metadata for display and
@@ -84,6 +96,19 @@ class LibraryAgent(pydantic.BaseModel):
 
     # Indicates whether there's a new output (based on recent runs)
     new_output: bool
 
+    # Execution metrics (for quality assessment by LLM)
+    execution_count: int = 0  # Number of recent executions sampled
+    success_rate: float | None = (
+        None  # Percentage (0-100) of technically successful executions
+    )
+    avg_correctness_score: float | None = (
+        None  # 0-1 score of how well executions achieved their purpose
+    )
+    recent_executions: list[RecentExecution] = pydantic.Field(
+        default_factory=list,
+        description="List of recent executions with status, score, and summary",
+    )
+
     # Whether the user can access the underlying graph
     can_access_graph: bool
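All four new fields default to empty values, so existing code that constructs a LibraryAgent without them should continue to validate. A trimmed, standalone illustration of those defaults (AgentMetrics is a stand-in model, not part of the commit):

import pydantic


class RecentExecution(pydantic.BaseModel):
    status: str
    correctness_score: float | None = None
    activity_summary: str | None = None


class AgentMetrics(pydantic.BaseModel):
    # Stand-in carrying only the fields this commit adds to LibraryAgent.
    execution_count: int = 0
    success_rate: float | None = None
    avg_correctness_score: float | None = None
    recent_executions: list[RecentExecution] = pydantic.Field(default_factory=list)


print(AgentMetrics().model_dump())
# -> {'execution_count': 0, 'success_rate': None,
#     'avg_correctness_score': None, 'recent_executions': []}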
@@ -145,6 +170,50 @@ class LibraryAgent(pydantic.BaseModel):
         status = status_result.status
         new_output = status_result.new_output
 
+        # Calculate execution metrics
+        execution_count = len(executions)
+        success_rate: float | None = None
+        avg_correctness_score: float | None = None
+
+        if execution_count > 0:
+            success_count = sum(
+                1
+                for e in executions
+                if e.executionStatus == prisma.enums.AgentExecutionStatus.COMPLETED
+            )
+            success_rate = (success_count / execution_count) * 100
+
+            # Calculate average correctness score from execution stats
+            correctness_scores = []
+            for e in executions:
+                if e.stats and isinstance(e.stats, dict):
+                    score = e.stats.get("correctness_score")
+                    if score is not None and isinstance(score, (int, float)):
+                        correctness_scores.append(float(score))
+            if correctness_scores:
+                avg_correctness_score = sum(correctness_scores) / len(
+                    correctness_scores
+                )
+
+        # Build recent executions list with status, score, and summary
+        recent_executions: list[RecentExecution] = []
+        for e in executions:
+            exec_score: float | None = None
+            exec_summary: str | None = None
+            if e.stats and isinstance(e.stats, dict):
+                score = e.stats.get("correctness_score")
+                if score is not None and isinstance(score, (int, float)):
+                    exec_score = float(score)
+                summary = e.stats.get("activity_status")
+                if summary is not None and isinstance(summary, str):
+                    exec_summary = summary
+            recent_executions.append(
+                RecentExecution(
+                    status=e.executionStatus.value,
+                    correctness_score=exec_score,
+                    activity_summary=exec_summary,
+                )
+            )
+
         # Check if user can access the graph
         can_access_graph = agent.AgentGraph.userId == agent.userId
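A quick numeric check of the two formulas with three sampled executions, two COMPLETED and one FAILED, where only the completed runs carry a correctness score (sample data; the filtering mirrors the stats-dict logic above):

executions = [
    {"status": "COMPLETED", "stats": {"correctness_score": 0.9}},
    {"status": "COMPLETED", "stats": {"correctness_score": 0.7}},
    {"status": "FAILED", "stats": {}},
]

execution_count = len(executions)  # 3
success_count = sum(1 for e in executions if e["status"] == "COMPLETED")  # 2
success_rate = (success_count / execution_count) * 100  # 66.67 on the 0-100 scale

correctness_scores = []
for e in executions:
    score = e["stats"].get("correctness_score")
    if isinstance(score, (int, float)):
        correctness_scores.append(float(score))
avg_correctness_score = (
    sum(correctness_scores) / len(correctness_scores) if correctness_scores else None
)  # (0.9 + 0.7) / 2 = 0.8 on the 0-1 scale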
@@ -190,6 +259,10 @@ class LibraryAgent(pydantic.BaseModel):
             has_sensitive_action=graph.has_sensitive_action,
             trigger_setup_info=graph.trigger_setup_info,
             new_output=new_output,
+            execution_count=execution_count,
+            success_rate=success_rate,
+            avg_correctness_score=avg_correctness_score,
+            recent_executions=recent_executions,
             can_access_graph=can_access_graph,
             is_latest_version=is_latest_version,
             is_favorite=agent.isFavorite,