feat(backend): parameterize activity status generation with customizable prompts (#11407)

## Summary

Implement comprehensive parameterization of the activity status
generation system to enable custom prompts for the admin analytics
dashboard.

## Changes Made

### Core Function Enhancement (`activity_status_generator.py`)
- **Extract hardcoded prompts to constants**: `DEFAULT_SYSTEM_PROMPT`
and `DEFAULT_USER_PROMPT` (see the reuse sketch after this list)
- **Add prompt parameters**: `system_prompt`, `user_prompt` with
defaults to maintain backward compatibility
- **Template substitution system**: User prompt supports
`{{GRAPH_NAME}}` and `{{EXECUTION_DATA}}` placeholders
- **Skip existing flag**: `skip_existing` parameter lets admins force
regeneration of existing data
- **Maintain manager compatibility**: All existing calls continue to
work with default parameters
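
Because the prompts are now module-level constants, other backend code can
import and build on them. A minimal, hypothetical reuse sketch (the import
path matches the one used in `execution_analytics_routes.py` further down;
`strict_system_prompt` is illustrative only, not part of this change):

```python
from backend.executor.activity_status_generator import (
    DEFAULT_SYSTEM_PROMPT,
    DEFAULT_USER_PROMPT,
)

# Illustrative only: keep the default evaluation criteria but append a
# stricter rule before passing the prompt to the generator.
strict_system_prompt = (
    DEFAULT_SYSTEM_PROMPT
    + "\nTreat any execution whose output blocks produced nothing as a failure."
)

# The default user prompt still carries the template placeholders.
assert "{{GRAPH_NAME}}" in DEFAULT_USER_PROMPT
assert "{{EXECUTION_DATA}}" in DEFAULT_USER_PROMPT
```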

### Admin API Enhancement (`execution_analytics_routes.py`)
- **Custom prompt fields**: optional `system_prompt` and `user_prompt`
fields on `ExecutionAnalyticsRequest` (example request after this list)
- **Skip existing control**: `skip_existing` boolean flag that lets
admins opt into regenerating existing results
- **Template documentation**: Clear documentation of placeholder system
in field descriptions
- **Backward compatibility**: All existing API calls work unchanged
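
For illustration, an admin call with custom prompts might look like the
sketch below. The endpoint path comes from the OpenAPI diff at the bottom of
this page; the host, port, and bearer token are placeholders, not values
introduced by this change:

```python
import httpx

ADMIN_TOKEN = "replace-with-an-admin-jwt"  # placeholder

payload = {
    "graph_id": "example-graph-id",
    "model_name": "gpt-4o-mini",
    "batch_size": 10,
    "skip_existing": False,  # force regeneration of existing results
    "system_prompt": "You are a strict reviewer of agent executions.",
    "user_prompt": (
        "A user ran '{{GRAPH_NAME}}'. Review this execution data:\n\n"
        "{{EXECUTION_DATA}}\n\n"
        "Return an activity summary and a correctness score."
    ),
}

response = httpx.post(
    "http://localhost:8006/api/executions/admin/execution_analytics",  # host/port assumed
    json=payload,
    headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
)
response.raise_for_status()
print(response.json())
```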

### Template System Design
- **Simple placeholder replacement**: `{{GRAPH_NAME}}` → actual graph
name, `{{EXECUTION_DATA}}` → JSON execution data (sketch after this
list)
- **No dependencies**: Uses plain `str.replace()` for maximum
compatibility
- **JSON safety**: Execution data properly serialized as indented JSON
- **Validation tested**: Template substitution verified to work
correctly
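
A minimal sketch of the substitution, mirroring the two `replace()` calls
visible in the `activity_status_generator.py` diff below (`render_user_prompt`
is a hypothetical helper; the real code performs the replacement inline):

```python
import json

def render_user_prompt(user_prompt: str, graph_name: str, execution_data: dict) -> str:
    """Plain str.replace() substitution; no templating library involved."""
    execution_data_json = json.dumps(execution_data, indent=2)  # indented JSON for readability
    return user_prompt.replace("{{GRAPH_NAME}}", graph_name).replace(
        "{{EXECUTION_DATA}}", execution_data_json
    )

template = "A user ran '{{GRAPH_NAME}}'. Execution data:\n{{EXECUTION_DATA}}"
print(render_user_prompt(template, "Blog Writer", {"overall_status": {"graph_error": None}}))
```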

## Key Features

### For Regular Users (Manager Integration)
- **No changes required**: Existing manager.py calls work unchanged
- **Default behavior preserved**: Same prompts and logic as before
- **Feature flag compatibility**: LaunchDarkly integration unchanged

### For Admin Analytics Dashboard
- **Custom system prompts**: Admins can override the AI evaluation
criteria
- **Custom user prompts**: Admins can modify the analysis instructions
with execution data templates
- **Force regeneration**: `skip_existing=False` allows reprocessing
existing executions with new prompts
- **Complete model list**: Access to all LLM models from `llm.py` (70+
models including GPT, Claude, Gemini, etc.) via the new config endpoint
(fetch sketch after this list)
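
The model list and default prompts shown to admins are served by the new
config endpoint (see the OpenAPI diff below). A hedged fetch sketch; host,
port, and token are placeholders:

```python
import httpx

ADMIN_TOKEN = "replace-with-an-admin-jwt"  # placeholder

response = httpx.get(
    "http://localhost:8006/api/executions/admin/execution_analytics/config",  # host/port assumed
    headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
)
response.raise_for_status()
config = response.json()

print(config["recommended_model"])           # e.g. "gpt-4o-mini"
print(len(config["available_models"]))       # full LlmModel list with value/label/provider
print(config["default_system_prompt"][:80])  # preview of the built-in prompt
```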

## Technical Validation
- Template substitution tested and working
- Default behavior preserved for existing code
- Admin API parameter validation working
- All imports and function signatures correct
- Backward compatibility maintained

## Use Cases Enabled
- **A/B testing**: Compare different prompt strategies on the same
execution data
- **Custom evaluation**: Tailor success criteria for specific graph
types
- **Prompt optimization**: Iterate on prompt design based on admin
feedback
- **Bulk reprocessing**: Regenerate activity status with improved
prompts

## Testing
- Template substitution functionality verified
- Function signatures and imports validated
- Code formatting and linting passed
- Backward compatibility confirmed

## Breaking Changes
None. All existing functionality is preserved with default parameters.

## Related Issues
Resolves the requirement to expose prompt customization on the frontend
execution analytics dashboard.

---------

Co-authored-by: Claude <noreply@anthropic.com>
Authored by Zamil Majdy on 2025-11-19 20:38:08 +07:00; committed by GitHub.
Commit 901bb31e14 (parent 9438817702): 5 changed files with 535 additions and 123 deletions.

View File

@@ -460,6 +460,7 @@ class NodeExecutionResult(BaseModel):
async def get_graph_executions(
graph_exec_id: Optional[str] = None,
graph_id: Optional[str] = None,
graph_version: Optional[int] = None,
user_id: Optional[str] = None,
statuses: Optional[list[ExecutionStatus]] = None,
created_time_gte: Optional[datetime] = None,
@@ -476,6 +477,8 @@ async def get_graph_executions(
where_filter["userId"] = user_id
if graph_id:
where_filter["agentGraphId"] = graph_id
if graph_version is not None:
where_filter["agentGraphVersion"] = graph_version
if created_time_gte or created_time_lte:
where_filter["createdAt"] = {
"gte": created_time_gte or datetime.min.replace(tzinfo=timezone.utc),

View File

@@ -27,6 +27,101 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
# Default system prompt template for activity status generation
DEFAULT_SYSTEM_PROMPT = """You are an AI assistant analyzing what an agent execution accomplished and whether it worked correctly.
You need to provide both a user-friendly summary AND a correctness assessment.
FOR THE ACTIVITY STATUS:
- Write from the user's perspective about what they accomplished, NOT about technical execution details
- Focus on the ACTUAL TASK the user wanted done, not the internal workflow steps
- Avoid technical terms like 'workflow', 'execution', 'components', 'nodes', 'processing', etc.
- Keep it to 3 sentences maximum. Be conversational and human-friendly
FOR THE CORRECTNESS SCORE:
- Provide a score from 0.0 to 1.0 indicating how well the execution achieved its intended purpose
- Use this scoring guide:
0.0-0.2: Failure - The result clearly did not meet the task requirements
0.2-0.4: Poor - Major issues; only small parts of the goal were achieved
0.4-0.6: Partial Success - Some objectives met, but with noticeable gaps or inaccuracies
0.6-0.8: Mostly Successful - Largely achieved the intended outcome, with minor flaws
0.8-1.0: Success - Fully met or exceeded the task requirements
- Base the score on actual outputs produced, not just technical completion
UNDERSTAND THE INTENDED PURPOSE:
- FIRST: Read the graph description carefully to understand what the user wanted to accomplish
- The graph name and description tell you the main goal/intention of this automation
- Use this intended purpose as your PRIMARY criteria for success/failure evaluation
- Ask yourself: 'Did this execution actually accomplish what the graph was designed to do?'
CRITICAL OUTPUT ANALYSIS:
- Check if blocks that should produce user-facing results actually produced outputs
- Blocks with names containing 'Output', 'Post', 'Create', 'Send', 'Publish', 'Generate' are usually meant to produce final results
- If these critical blocks have NO outputs (empty recent_outputs), the task likely FAILED even if status shows 'completed'
- Sub-agents (AgentExecutorBlock) that produce no outputs usually indicate failed sub-tasks
- Most importantly: Does the execution result match what the graph description promised to deliver?
SUCCESS EVALUATION BASED ON INTENTION:
- If the graph is meant to 'create blog posts' → check if blog content was actually created
- If the graph is meant to 'send emails' → check if emails were actually sent
- If the graph is meant to 'analyze data' → check if analysis results were produced
- If the graph is meant to 'generate reports' → check if reports were generated
- Technical completion ≠ goal achievement. Focus on whether the USER'S INTENDED OUTCOME was delivered
IMPORTANT: Be HONEST about what actually happened:
- If the input was invalid/nonsensical, say so directly
- If the task failed, explain what went wrong in simple terms
- If errors occurred, focus on what the user needs to know
- Only claim success if the INTENDED PURPOSE was genuinely accomplished AND produced expected outputs
- Don't sugar-coat failures or present them as helpful feedback
- ESPECIALLY: If the graph's main purpose wasn't achieved, this is a failure regardless of 'completed' status
Understanding Errors:
- Node errors: Individual steps may fail but the overall task might still complete (e.g., one data source fails but others work)
- Graph error (in overall_status.graph_error): This means the entire execution failed and nothing was accomplished
- Missing outputs from critical blocks: Even if no errors, this means the task failed to produce expected results
- Focus on whether the graph's intended purpose was fulfilled, not whether technical steps completed"""
# Default user prompt template for activity status generation
DEFAULT_USER_PROMPT = """A user ran '{{GRAPH_NAME}}' to accomplish something. Based on this execution data,
provide both an activity summary and correctness assessment:
{{EXECUTION_DATA}}
ANALYSIS CHECKLIST:
1. READ graph_info.description FIRST - this tells you what the user intended to accomplish
2. Check overall_status.graph_error - if present, the entire execution failed
3. Look for nodes with 'Output', 'Post', 'Create', 'Send', 'Publish', 'Generate' in their block_name
4. Check if these critical blocks have empty recent_outputs arrays - this indicates failure
5. Look for AgentExecutorBlock (sub-agents) with no outputs - this suggests sub-task failures
6. Count how many nodes produced outputs vs total nodes - low ratio suggests problems
7. MOST IMPORTANT: Does the execution outcome match what graph_info.description promised?
INTENTION-BASED EVALUATION:
- If description mentions 'blog writing' → did it create blog content?
- If description mentions 'email automation' → were emails actually sent?
- If description mentions 'data analysis' → were analysis results produced?
- If description mentions 'content generation' → was content actually generated?
- If description mentions 'social media posting' → were posts actually made?
- Match the outputs to the stated intention, not just technical completion
PROVIDE:
activity_status: 1-3 sentences about what the user accomplished, such as:
- 'I analyzed your resume and provided detailed feedback for the IT industry.'
- 'I couldn't complete the task because critical steps failed to produce any results.'
- 'I failed to generate the content you requested due to missing API access.'
- 'I extracted key information from your documents and organized it into a summary.'
- 'The task failed because the blog post creation step didn't produce any output.'
correctness_score: A float score from 0.0 to 1.0 based on how well the intended purpose was achieved:
- 0.0-0.2: Failure (didn't meet requirements)
- 0.2-0.4: Poor (major issues, minimal achievement)
- 0.4-0.6: Partial Success (some objectives met with gaps)
- 0.6-0.8: Mostly Successful (largely achieved with minor flaws)
- 0.8-1.0: Success (fully met or exceeded requirements)
BE CRITICAL: If the graph's intended purpose (from description) wasn't achieved, use a low score (0.0-0.4) even if status is 'completed'."""
class ErrorInfo(TypedDict):
"""Type definition for error information."""
@@ -93,6 +188,9 @@ async def generate_activity_status_for_execution(
execution_status: ExecutionStatus | None = None,
model_name: str = "gpt-4o-mini",
skip_feature_flag: bool = False,
system_prompt: str = DEFAULT_SYSTEM_PROMPT,
user_prompt: str = DEFAULT_USER_PROMPT,
skip_existing: bool = True,
) -> ActivityStatusResponse | None:
"""
Generate an AI-based activity status summary and correctness assessment for a graph execution.
@@ -108,10 +206,15 @@ async def generate_activity_status_for_execution(
db_client: Database client for fetching data
user_id: User ID for LaunchDarkly feature flag evaluation
execution_status: The overall execution status (COMPLETED, FAILED, TERMINATED)
model_name: AI model to use for generation (default: gpt-4o-mini)
skip_feature_flag: Whether to skip LaunchDarkly feature flag check
system_prompt: Custom system prompt template (default: DEFAULT_SYSTEM_PROMPT)
user_prompt: Custom user prompt template with placeholders (default: DEFAULT_USER_PROMPT)
skip_existing: Whether to skip if activity_status and correctness_score already exist
Returns:
AI-generated activity status response with activity_status and correctness_status,
or None if feature is disabled
or None if feature is disabled or skipped
"""
# Check LaunchDarkly feature flag for AI activity status generation with full context support
if not skip_feature_flag and not await is_feature_enabled(
@@ -120,6 +223,20 @@ async def generate_activity_status_for_execution(
logger.debug("AI activity status generation is disabled via LaunchDarkly")
return None
# Check if we should skip existing data (for admin regeneration option)
if (
skip_existing
and execution_stats.activity_status
and execution_stats.correctness_score is not None
):
logger.debug(
f"Skipping activity status generation for {graph_exec_id}: already exists"
)
return {
"activity_status": execution_stats.activity_status,
"correctness_score": execution_stats.correctness_score,
}
# Check if we have OpenAI API key
try:
settings = Settings()
@@ -157,94 +274,23 @@ async def generate_activity_status_for_execution(
execution_status,
)
# Prepare execution data as JSON for template substitution
execution_data_json = json.dumps(execution_data, indent=2)
# Perform template substitution for user prompt
user_prompt_content = user_prompt.replace("{{GRAPH_NAME}}", graph_name).replace(
"{{EXECUTION_DATA}}", execution_data_json
)
# Prepare prompt for AI with structured output requirements
prompt = [
{
"role": "system",
"content": (
"You are an AI assistant analyzing what an agent execution accomplished and whether it worked correctly. "
"You need to provide both a user-friendly summary AND a correctness assessment.\n\n"
"FOR THE ACTIVITY STATUS:\n"
"- Write from the user's perspective about what they accomplished, NOT about technical execution details\n"
"- Focus on the ACTUAL TASK the user wanted done, not the internal workflow steps\n"
"- Avoid technical terms like 'workflow', 'execution', 'components', 'nodes', 'processing', etc.\n"
"- Keep it to 3 sentences maximum. Be conversational and human-friendly\n\n"
"FOR THE CORRECTNESS SCORE:\n"
"- Provide a score from 0.0 to 1.0 indicating how well the execution achieved its intended purpose\n"
"- Use this scoring guide:\n"
" 0.0-0.2: Failure - The result clearly did not meet the task requirements\n"
" 0.2-0.4: Poor - Major issues; only small parts of the goal were achieved\n"
" 0.4-0.6: Partial Success - Some objectives met, but with noticeable gaps or inaccuracies\n"
" 0.6-0.8: Mostly Successful - Largely achieved the intended outcome, with minor flaws\n"
" 0.8-1.0: Success - Fully met or exceeded the task requirements\n"
"- Base the score on actual outputs produced, not just technical completion\n\n"
"UNDERSTAND THE INTENDED PURPOSE:\n"
"- FIRST: Read the graph description carefully to understand what the user wanted to accomplish\n"
"- The graph name and description tell you the main goal/intention of this automation\n"
"- Use this intended purpose as your PRIMARY criteria for success/failure evaluation\n"
"- Ask yourself: 'Did this execution actually accomplish what the graph was designed to do?'\n\n"
"CRITICAL OUTPUT ANALYSIS:\n"
"- Check if blocks that should produce user-facing results actually produced outputs\n"
"- Blocks with names containing 'Output', 'Post', 'Create', 'Send', 'Publish', 'Generate' are usually meant to produce final results\n"
"- If these critical blocks have NO outputs (empty recent_outputs), the task likely FAILED even if status shows 'completed'\n"
"- Sub-agents (AgentExecutorBlock) that produce no outputs usually indicate failed sub-tasks\n"
"- Most importantly: Does the execution result match what the graph description promised to deliver?\n\n"
"SUCCESS EVALUATION BASED ON INTENTION:\n"
"- If the graph is meant to 'create blog posts' → check if blog content was actually created\n"
"- If the graph is meant to 'send emails' → check if emails were actually sent\n"
"- If the graph is meant to 'analyze data' → check if analysis results were produced\n"
"- If the graph is meant to 'generate reports' → check if reports were generated\n"
"- Technical completion ≠ goal achievement. Focus on whether the USER'S INTENDED OUTCOME was delivered\n\n"
"IMPORTANT: Be HONEST about what actually happened:\n"
"- If the input was invalid/nonsensical, say so directly\n"
"- If the task failed, explain what went wrong in simple terms\n"
"- If errors occurred, focus on what the user needs to know\n"
"- Only claim success if the INTENDED PURPOSE was genuinely accomplished AND produced expected outputs\n"
"- Don't sugar-coat failures or present them as helpful feedback\n"
"- ESPECIALLY: If the graph's main purpose wasn't achieved, this is a failure regardless of 'completed' status\n\n"
"Understanding Errors:\n"
"- Node errors: Individual steps may fail but the overall task might still complete (e.g., one data source fails but others work)\n"
"- Graph error (in overall_status.graph_error): This means the entire execution failed and nothing was accomplished\n"
"- Missing outputs from critical blocks: Even if no errors, this means the task failed to produce expected results\n"
"- Focus on whether the graph's intended purpose was fulfilled, not whether technical steps completed"
),
"content": system_prompt,
},
{
"role": "user",
"content": (
f"A user ran '{graph_name}' to accomplish something. Based on this execution data, "
f"provide both an activity summary and correctness assessment:\n\n"
f"{json.dumps(execution_data, indent=2)}\n\n"
"ANALYSIS CHECKLIST:\n"
"1. READ graph_info.description FIRST - this tells you what the user intended to accomplish\n"
"2. Check overall_status.graph_error - if present, the entire execution failed\n"
"3. Look for nodes with 'Output', 'Post', 'Create', 'Send', 'Publish', 'Generate' in their block_name\n"
"4. Check if these critical blocks have empty recent_outputs arrays - this indicates failure\n"
"5. Look for AgentExecutorBlock (sub-agents) with no outputs - this suggests sub-task failures\n"
"6. Count how many nodes produced outputs vs total nodes - low ratio suggests problems\n"
"7. MOST IMPORTANT: Does the execution outcome match what graph_info.description promised?\n\n"
"INTENTION-BASED EVALUATION:\n"
"- If description mentions 'blog writing' → did it create blog content?\n"
"- If description mentions 'email automation' → were emails actually sent?\n"
"- If description mentions 'data analysis' → were analysis results produced?\n"
"- If description mentions 'content generation' → was content actually generated?\n"
"- If description mentions 'social media posting' → were posts actually made?\n"
"- Match the outputs to the stated intention, not just technical completion\n\n"
"PROVIDE:\n"
"activity_status: 1-3 sentences about what the user accomplished, such as:\n"
"- 'I analyzed your resume and provided detailed feedback for the IT industry.'\n"
"- 'I couldn't complete the task because critical steps failed to produce any results.'\n"
"- 'I failed to generate the content you requested due to missing API access.'\n"
"- 'I extracted key information from your documents and organized it into a summary.'\n"
"- 'The task failed because the blog post creation step didn't produce any output.'\n\n"
"correctness_score: A float score from 0.0 to 1.0 based on how well the intended purpose was achieved:\n"
"- 0.0-0.2: Failure (didn't meet requirements)\n"
"- 0.2-0.4: Poor (major issues, minimal achievement)\n"
"- 0.4-0.6: Partial Success (some objectives met with gaps)\n"
"- 0.6-0.8: Mostly Successful (largely achieved with minor flaws)\n"
"- 0.8-1.0: Success (fully met or exceeded requirements)\n\n"
"BE CRITICAL: If the graph's intended purpose (from description) wasn't achieved, use a low score (0.0-0.4) even if status is 'completed'."
),
"content": user_prompt_content,
},
]

View File

@@ -7,6 +7,7 @@ from autogpt_libs.auth import get_user_id, requires_admin_user
from fastapi import APIRouter, HTTPException, Security
from pydantic import BaseModel, Field
from backend.blocks.llm import LlmModel
from backend.data.execution import (
ExecutionStatus,
GraphExecutionMeta,
@@ -15,6 +16,8 @@ from backend.data.execution import (
)
from backend.data.model import GraphExecutionStats
from backend.executor.activity_status_generator import (
DEFAULT_SYSTEM_PROMPT,
DEFAULT_USER_PROMPT,
generate_activity_status_for_execution,
)
from backend.executor.manager import get_db_async_client
@@ -30,12 +33,21 @@ class ExecutionAnalyticsRequest(BaseModel):
created_after: Optional[datetime] = Field(
None, description="Optional created date lower bound"
)
model_name: Optional[str] = Field(
"gpt-4o-mini", description="Model to use for generation"
)
model_name: str = Field("gpt-4o-mini", description="Model to use for generation")
batch_size: int = Field(
10, description="Batch size for concurrent processing", le=25, ge=1
)
system_prompt: Optional[str] = Field(
None, description="Custom system prompt (default: built-in prompt)"
)
user_prompt: Optional[str] = Field(
None,
description="Custom user prompt with {{GRAPH_NAME}} and {{EXECUTION_DATA}} placeholders (default: built-in prompt)",
)
skip_existing: bool = Field(
True,
description="Whether to skip executions that already have activity status and correctness score",
)
class ExecutionAnalyticsResult(BaseModel):
@@ -58,6 +70,19 @@ class ExecutionAnalyticsResponse(BaseModel):
results: list[ExecutionAnalyticsResult]
class ModelInfo(BaseModel):
value: str
label: str
provider: str
class ExecutionAnalyticsConfig(BaseModel):
available_models: list[ModelInfo]
default_system_prompt: str
default_user_prompt: str
recommended_model: str
router = APIRouter(
prefix="/admin",
tags=["admin", "execution_analytics"],
@@ -65,6 +90,100 @@ router = APIRouter(
)
@router.get(
"/execution_analytics/config",
response_model=ExecutionAnalyticsConfig,
summary="Get Execution Analytics Configuration",
)
async def get_execution_analytics_config(
admin_user_id: str = Security(get_user_id),
):
"""
Get the configuration for execution analytics including:
- Available AI models with metadata
- Default system and user prompts
- Recommended model selection
"""
logger.info(f"Admin user {admin_user_id} requesting execution analytics config")
# Generate model list from LlmModel enum with provider information
available_models = []
# Function to generate friendly display names from model values
def generate_model_label(model: LlmModel) -> str:
"""Generate a user-friendly label from the model enum value."""
value = model.value
# For all models, convert underscores/hyphens to spaces and title case
# e.g., "gpt-4-turbo" -> "GPT 4 Turbo", "claude-3-haiku-20240307" -> "Claude 3 Haiku"
parts = value.replace("_", "-").split("-")
# Handle provider prefixes (e.g., "google/", "x-ai/")
if "/" in value:
_, model_name = value.split("/", 1)
parts = model_name.replace("_", "-").split("-")
# Capitalize and format parts
formatted_parts = []
for part in parts:
# Skip date-like patterns - check for various date formats:
# - Long dates like "20240307" (8 digits)
# - Year components like "2024", "2025" (4 digit years >= 2020)
# - Month/day components like "04", "16" when they appear to be dates
if part.isdigit():
if len(part) >= 8: # Long date format like "20240307"
continue
elif len(part) == 4 and int(part) >= 2020: # Year like "2024", "2025"
continue
elif len(part) <= 2 and int(part) <= 31: # Month/day like "04", "16"
# Skip if this looks like a date component (basic heuristic)
continue
# Keep version numbers as-is
if part.replace(".", "").isdigit():
formatted_parts.append(part)
# Capitalize normal words
else:
formatted_parts.append(
part.upper()
if part.upper() in ["GPT", "LLM", "API", "V0"]
else part.capitalize()
)
model_name = " ".join(formatted_parts)
# Format provider name for better display
provider_name = model.provider.replace("_", " ").title()
# Return with provider prefix for clarity
return f"{provider_name}: {model_name}"
# Include all LlmModel values (no more filtering by hardcoded list)
recommended_model = LlmModel.GPT4O_MINI.value
for model in LlmModel:
label = generate_model_label(model)
# Add "(Recommended)" suffix to the recommended model
if model.value == recommended_model:
label += " (Recommended)"
available_models.append(
ModelInfo(
value=model.value,
label=label,
provider=model.provider,
)
)
# Sort models by provider and name for better UX
available_models.sort(key=lambda x: (x.provider, x.label))
return ExecutionAnalyticsConfig(
available_models=available_models,
default_system_prompt=DEFAULT_SYSTEM_PROMPT,
default_user_prompt=DEFAULT_USER_PROMPT,
recommended_model=recommended_model,
)
@router.post(
"/execution_analytics",
response_model=ExecutionAnalyticsResponse,
@@ -100,6 +219,7 @@ async def generate_execution_analytics(
# Fetch executions to process
executions = await get_graph_executions(
graph_id=request.graph_id,
graph_version=request.graph_version,
user_id=request.user_id,
created_time_gte=request.created_after,
statuses=[
@@ -113,21 +233,20 @@ async def generate_execution_analytics(
f"Found {len(executions)} total executions for graph {request.graph_id}"
)
# Filter executions that need analytics generation (missing activity_status or correctness_score)
# Filter executions that need analytics generation
executions_to_process = []
for execution in executions:
# Skip if we should skip existing analytics and both activity_status and correctness_score exist
if (
not execution.stats
or not execution.stats.activity_status
or execution.stats.correctness_score is None
request.skip_existing
and execution.stats
and execution.stats.activity_status
and execution.stats.correctness_score is not None
):
continue
# If version is specified, filter by it
if (
request.graph_version is None
or execution.graph_version == request.graph_version
):
executions_to_process.append(execution)
# Add execution to processing list
executions_to_process.append(execution)
logger.info(
f"Found {len(executions_to_process)} executions needing analytics generation"
@@ -152,9 +271,7 @@ async def generate_execution_analytics(
f"Processing batch {batch_idx + 1}/{total_batches} with {len(batch)} executions"
)
batch_results = await _process_batch(
batch, request.model_name or "gpt-4o-mini", db_client
)
batch_results = await _process_batch(batch, request, db_client)
for result in batch_results:
results.append(result)
@@ -212,7 +329,7 @@ async def generate_execution_analytics(
async def _process_batch(
executions, model_name: str, db_client
executions, request: ExecutionAnalyticsRequest, db_client
) -> list[ExecutionAnalyticsResult]:
"""Process a batch of executions concurrently."""
@@ -237,8 +354,11 @@ async def _process_batch(
db_client=db_client,
user_id=execution.user_id,
execution_status=execution.status,
model_name=model_name, # Pass model name parameter
model_name=request.model_name,
skip_feature_flag=True, # Admin endpoint bypasses feature flags
system_prompt=request.system_prompt or DEFAULT_SYSTEM_PROMPT,
user_prompt=request.user_prompt or DEFAULT_USER_PROMPT,
skip_existing=request.skip_existing,
)
if not activity_response:

View File

@@ -1,6 +1,6 @@
"use client";
import { useState } from "react";
import { useState, useEffect } from "react";
import { Button } from "@/components/atoms/Button/Button";
import { Input } from "@/components/__legacy__/ui/input";
import { Label } from "@/components/__legacy__/ui/label";
@@ -11,36 +11,37 @@ import {
SelectTrigger,
SelectValue,
} from "@/components/__legacy__/ui/select";
import { Textarea } from "@/components/__legacy__/ui/textarea";
import { Checkbox } from "@/components/__legacy__/ui/checkbox";
import { Collapsible } from "@/components/molecules/Collapsible/Collapsible";
import { useToast } from "@/components/molecules/Toast/use-toast";
import { usePostV2GenerateExecutionAnalytics } from "@/app/api/__generated__/endpoints/admin/admin";
import {
usePostV2GenerateExecutionAnalytics,
useGetV2GetExecutionAnalyticsConfiguration,
} from "@/app/api/__generated__/endpoints/admin/admin";
import type { ExecutionAnalyticsRequest } from "@/app/api/__generated__/models/executionAnalyticsRequest";
import type { ExecutionAnalyticsResponse } from "@/app/api/__generated__/models/executionAnalyticsResponse";
// Local interface for form state to simplify handling
interface FormData {
graph_id: string;
graph_version?: number;
user_id?: string;
created_after?: string;
model_name: string;
batch_size: number;
// Use the generated type with minimal adjustment for form handling
interface FormData extends Omit<ExecutionAnalyticsRequest, "created_after"> {
created_after?: string; // Keep as string for datetime-local input
// All other fields use the generated types as-is
}
import { AnalyticsResultsTable } from "./AnalyticsResultsTable";
const MODEL_OPTIONS = [
{ value: "gpt-4o-mini", label: "GPT-4o Mini (Recommended)" },
{ value: "gpt-4o", label: "GPT-4o" },
{ value: "gpt-4-turbo", label: "GPT-4 Turbo" },
{ value: "gpt-4.1", label: "GPT-4.1" },
{ value: "gpt-4.1-mini", label: "GPT-4.1 Mini" },
];
export function ExecutionAnalyticsForm() {
const [results, setResults] = useState<ExecutionAnalyticsResponse | null>(
null,
);
const { toast } = useToast();
// Fetch configuration from API
const {
data: config,
isLoading: configLoading,
error: configError,
} = useGetV2GetExecutionAnalyticsConfiguration();
const generateAnalytics = usePostV2GenerateExecutionAnalytics({
mutation: {
onSuccess: (res) => {
@@ -69,10 +70,23 @@ export function ExecutionAnalyticsForm() {
const [formData, setFormData] = useState<FormData>({
graph_id: "",
model_name: "gpt-4o-mini",
model_name: "", // Will be set from config
batch_size: 10, // Fixed internal value
skip_existing: true, // Default to skip existing
system_prompt: "", // Will use config default when empty
user_prompt: "", // Will use config default when empty
});
// Update form defaults when config loads
useEffect(() => {
if (config?.data && config.status === 200 && !formData.model_name) {
setFormData((prev) => ({
...prev,
model_name: config.data.recommended_model,
}));
}
}, [config, formData.model_name]);
const handleSubmit = async (e: React.FormEvent) => {
e.preventDefault();
@@ -92,6 +106,7 @@ export function ExecutionAnalyticsForm() {
graph_id: formData.graph_id.trim(),
model_name: formData.model_name,
batch_size: formData.batch_size,
skip_existing: formData.skip_existing,
};
if (formData.graph_version) {
@@ -110,6 +125,14 @@ export function ExecutionAnalyticsForm() {
payload.created_after = new Date(formData.created_after.trim());
}
if (formData.system_prompt?.trim()) {
payload.system_prompt = formData.system_prompt.trim();
}
if (formData.user_prompt?.trim()) {
payload.user_prompt = formData.user_prompt.trim();
}
generateAnalytics.mutate({ data: payload });
};
@@ -117,6 +140,26 @@ export function ExecutionAnalyticsForm() {
setFormData((prev: FormData) => ({ ...prev, [field]: value }));
};
// Show loading state while config loads
if (configLoading) {
return (
<div className="flex items-center justify-center py-8">
<div className="text-gray-500">Loading configuration...</div>
</div>
);
}
// Show error state if config fails to load
if (configError || !config?.data || config.status !== 200) {
return (
<div className="flex items-center justify-center py-8">
<div className="text-red-500">Failed to load configuration</div>
</div>
);
}
const configData = config.data;
return (
<div className="space-y-6">
<form onSubmit={handleSubmit} className="space-y-4">
@@ -182,9 +225,9 @@ export function ExecutionAnalyticsForm() {
<SelectValue placeholder="Select AI model" />
</SelectTrigger>
<SelectContent>
{MODEL_OPTIONS.map((option) => (
<SelectItem key={option.value} value={option.value}>
{option.label}
{configData.available_models.map((model) => (
<SelectItem key={model.value} value={model.value}>
{model.label}
</SelectItem>
))}
</SelectContent>
@@ -192,6 +235,127 @@ export function ExecutionAnalyticsForm() {
</div>
</div>
{/* Advanced Options Section - Collapsible */}
<div className="border-t pt-6">
<Collapsible
trigger={
<h3 className="text-lg font-semibold text-gray-700">
Advanced Options
</h3>
}
defaultOpen={false}
className="space-y-4"
>
<div className="space-y-4 pt-4">
{/* Skip Existing Checkbox */}
<div className="flex items-center space-x-2">
<Checkbox
id="skip_existing"
checked={formData.skip_existing}
onCheckedChange={(checked) =>
handleInputChange("skip_existing", checked)
}
/>
<Label htmlFor="skip_existing" className="text-sm">
Skip executions that already have activity status and
correctness score
</Label>
</div>
{/* Custom System Prompt */}
<div className="space-y-2">
<Label htmlFor="system_prompt">
Custom System Prompt (Optional)
</Label>
<Textarea
id="system_prompt"
value={formData.system_prompt || ""}
onChange={(e) =>
handleInputChange("system_prompt", e.target.value)
}
placeholder={configData.default_system_prompt}
rows={6}
className="resize-y"
/>
<p className="text-sm text-gray-600">
Customize how the AI evaluates execution success and failure.
Leave empty to use the default prompt shown above.
</p>
</div>
{/* Custom User Prompt */}
<div className="space-y-2">
<Label htmlFor="user_prompt">
Custom User Prompt Template (Optional)
</Label>
<Textarea
id="user_prompt"
value={formData.user_prompt || ""}
onChange={(e) =>
handleInputChange("user_prompt", e.target.value)
}
placeholder={configData.default_user_prompt}
rows={8}
className="resize-y"
/>
<p className="text-sm text-gray-600">
Customize the analysis instructions. Use{" "}
<code className="rounded bg-gray-100 px-1">
{"{{GRAPH_NAME}}"}
</code>{" "}
and{" "}
<code className="rounded bg-gray-100 px-1">
{"{{EXECUTION_DATA}}"}
</code>{" "}
as placeholders. Leave empty to use the default template shown
above.
</p>
</div>
{/* Quick Actions */}
<div className="flex flex-wrap gap-2 border-t pt-4">
<Button
type="button"
variant="secondary"
size="small"
onClick={() => {
handleInputChange(
"system_prompt",
configData.default_system_prompt,
);
}}
>
Reset System Prompt
</Button>
<Button
type="button"
variant="secondary"
size="small"
onClick={() => {
handleInputChange(
"user_prompt",
configData.default_user_prompt,
);
}}
>
Reset User Prompt
</Button>
<Button
type="button"
variant="secondary"
size="small"
onClick={() => {
handleInputChange("system_prompt", "");
handleInputChange("user_prompt", "");
}}
>
Clear All Prompts
</Button>
</div>
</div>
</Collapsible>
</div>
<div className="flex justify-end">
<Button
variant="primary"

View File

@@ -3886,6 +3886,30 @@
}
}
},
"/api/executions/admin/execution_analytics/config": {
"get": {
"tags": ["v2", "admin", "admin", "execution_analytics"],
"summary": "Get Execution Analytics Configuration",
"description": "Get the configuration for execution analytics including:\n- Available AI models with metadata\n- Default system and user prompts\n- Recommended model selection",
"operationId": "getV2Get execution analytics configuration",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ExecutionAnalyticsConfig"
}
}
}
},
"401": {
"$ref": "#/components/responses/HTTP401NotAuthenticatedError"
}
},
"security": [{ "HTTPBearerJWT": [] }]
}
},
"/api/executions/admin/execution_analytics": {
"post": {
"tags": ["v2", "admin", "admin", "execution_analytics"],
@@ -5759,6 +5783,35 @@
"required": ["url", "relevance_score"],
"title": "Document"
},
"ExecutionAnalyticsConfig": {
"properties": {
"available_models": {
"items": { "$ref": "#/components/schemas/ModelInfo" },
"type": "array",
"title": "Available Models"
},
"default_system_prompt": {
"type": "string",
"title": "Default System Prompt"
},
"default_user_prompt": {
"type": "string",
"title": "Default User Prompt"
},
"recommended_model": {
"type": "string",
"title": "Recommended Model"
}
},
"type": "object",
"required": [
"available_models",
"default_system_prompt",
"default_user_prompt",
"recommended_model"
],
"title": "ExecutionAnalyticsConfig"
},
"ExecutionAnalyticsRequest": {
"properties": {
"graph_id": {
@@ -5785,7 +5838,7 @@
"description": "Optional created date lower bound"
},
"model_name": {
"anyOf": [{ "type": "string" }, { "type": "null" }],
"type": "string",
"title": "Model Name",
"description": "Model to use for generation",
"default": "gpt-4o-mini"
@@ -5797,6 +5850,22 @@
"title": "Batch Size",
"description": "Batch size for concurrent processing",
"default": 10
},
"system_prompt": {
"anyOf": [{ "type": "string" }, { "type": "null" }],
"title": "System Prompt",
"description": "Custom system prompt (default: built-in prompt)"
},
"user_prompt": {
"anyOf": [{ "type": "string" }, { "type": "null" }],
"title": "User Prompt",
"description": "Custom user prompt with {{GRAPH_NAME}} and {{EXECUTION_DATA}} placeholders (default: built-in prompt)"
},
"skip_existing": {
"type": "boolean",
"title": "Skip Existing",
"description": "Whether to skip executions that already have activity status and correctness score",
"default": true
}
},
"type": "object",
@@ -6900,6 +6969,16 @@
"required": ["query", "response"],
"title": "Message"
},
"ModelInfo": {
"properties": {
"value": { "type": "string", "title": "Value" },
"label": { "type": "string", "title": "Label" },
"provider": { "type": "string", "title": "Provider" }
},
"type": "object",
"required": ["value", "label", "provider"],
"title": "ModelInfo"
},
"MyAgent": {
"properties": {
"agent_id": { "type": "string", "title": "Agent Id" },