feat(platform): add execution accuracy alert system (#11562)

## Summary

<img width="1263" height="883" alt="image"
src="https://github.com/user-attachments/assets/98d4f449-1897-4019-a599-846c27df4191"
/>
<img width="398" height="190" alt="image"
src="https://github.com/user-attachments/assets/0138ac02-420d-4f96-b980-74eb41e3c968"
/>

- Add execution accuracy monitoring with moving averages and Discord alerts
- Dashboard visualization for accuracy trends and alert detection
- Scheduled monitoring for marketplace agents (≥10 executions in the last 30 days) on a configurable interval
- Generated API client integration with type-safe models

## Features
- **Moving Average Analysis**: 3-day vs. 7-day comparison with configurable alert thresholds (see the sketch after this list)
- **Discord Notifications**: Alerts for accuracy drops ≥10%, sent on a configurable schedule (default daily)
- **Dashboard UI**: Real-time trends visualization with alert status
- **Type Safety**: Generated API hooks and models throughout
- **Error Handling**: Graceful OpenAI configuration handling
- **PostgreSQL Optimization**: Window functions for efficient trend
queries
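
At its core, the alert check compares the 3-day and 7-day moving averages. A minimal sketch of that condition follows; the `detect_accuracy_drop` helper is illustrative only, while the actual logic is the `drop_percent` SQL expression in `get_accuracy_trends_and_alerts` in the diff below:

```python
from typing import Optional


def detect_accuracy_drop(
    three_day_avg: Optional[float],
    seven_day_avg: Optional[float],
    drop_threshold: float = 10.0,
) -> Optional[float]:
    """Return the drop percentage when it crosses the alert threshold, else None."""
    # Mirrors the SQL: (seven_day_avg - three_day_avg) / seven_day_avg * 100
    if three_day_avg is None or seven_day_avg is None or seven_day_avg <= 0:
        return None
    drop_percent = (seven_day_avg - three_day_avg) / seven_day_avg * 100
    return drop_percent if drop_percent >= drop_threshold else None


# Example: a 7-day average of 80% against a 3-day average of 68% is a 15%
# relative drop, which exceeds the default 10% threshold.
print(detect_accuracy_drop(three_day_avg=68.0, seven_day_avg=80.0))  # ~15.0
```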

## Test plan
- [x] Backend accuracy monitoring logic tested with sample data
- [x] Frontend components using generated API hooks (no manual fetch)
- [x] Discord notification integration working
- [x] Admin authentication and authorization working
- [x] All formatting and linting checks passing
- [x] Error handling for missing OpenAI configuration
- [x] Test data available with `test-accuracy-agent-001`
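
A quick manual check of the new admin endpoint might look like the sketch below; the base URL, port, and JWT are placeholders rather than part of this PR, while the path and query parameters come from the OpenAPI changes further down:

```python
import httpx

# Placeholders (assumptions): adjust the base URL and supply a valid admin JWT
# for your environment.
BASE_URL = "http://localhost:8006"
ADMIN_JWT = "<admin-jwt>"

response = httpx.get(
    f"{BASE_URL}/api/executions/admin/execution_accuracy_trends",
    params={
        "graph_id": "test-accuracy-agent-001",  # test agent from the test plan
        "days_back": 30,
        "drop_threshold": 10.0,
        "include_historical": True,
    },
    headers={"Authorization": f"Bearer {ADMIN_JWT}"},
)
response.raise_for_status()
trends = response.json()
print(trends["latest_data"], trends["alert"])
```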

🤖 Generated with [Claude Code](https://claude.ai/code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
Author: Zamil Majdy
Date: 2025-12-09 02:28:57 +07:00
Committed by: GitHub
Parent: aaa8dcc5a8
Commit: c1e21d07e6
12 changed files with 916 additions and 34 deletions

View File

@@ -1,12 +1,45 @@
import logging
from datetime import datetime, timedelta, timezone
from typing import Optional
import prisma.types
from pydantic import BaseModel
from backend.data.db import query_raw_with_schema
from backend.util.json import SafeJson
logger = logging.getLogger(__name__)
class AccuracyAlertData(BaseModel):
"""Alert data when accuracy drops significantly."""
graph_id: str
user_id: Optional[str]
drop_percent: float
three_day_avg: float
seven_day_avg: float
detected_at: datetime
class AccuracyLatestData(BaseModel):
"""Latest execution accuracy data point."""
date: datetime
daily_score: Optional[float]
three_day_avg: Optional[float]
seven_day_avg: Optional[float]
fourteen_day_avg: Optional[float]
class AccuracyTrendsResponse(BaseModel):
"""Response model for accuracy trends and alerts."""
latest_data: AccuracyLatestData
alert: Optional[AccuracyAlertData]
historical_data: Optional[list[AccuracyLatestData]] = None
async def log_raw_analytics(
user_id: str,
type: str,
@@ -43,3 +76,217 @@ async def log_raw_metric(
)
return result
async def get_accuracy_trends_and_alerts(
graph_id: str,
days_back: int = 30,
user_id: Optional[str] = None,
drop_threshold: float = 10.0,
include_historical: bool = False,
) -> AccuracyTrendsResponse:
"""Get accuracy trends and detect alerts for a specific graph."""
query_template = """
WITH daily_scores AS (
SELECT
DATE(e."createdAt") as execution_date,
AVG(CASE
WHEN e.stats IS NOT NULL
AND e.stats::json->>'correctness_score' IS NOT NULL
AND e.stats::json->>'correctness_score' != 'null'
THEN (e.stats::json->>'correctness_score')::float * 100
ELSE NULL
END) as daily_score
FROM {schema_prefix}"AgentGraphExecution" e
WHERE e."agentGraphId" = $1::text
AND e."isDeleted" = false
AND e."createdAt" >= $2::timestamp
AND e."executionStatus" IN ('COMPLETED', 'FAILED', 'TERMINATED')
{user_filter}
GROUP BY DATE(e."createdAt")
HAVING COUNT(*) >= 3 -- Need at least 3 executions per day
),
trends AS (
SELECT
execution_date,
daily_score,
AVG(daily_score) OVER (
ORDER BY execution_date
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) as three_day_avg,
AVG(daily_score) OVER (
ORDER BY execution_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as seven_day_avg,
AVG(daily_score) OVER (
ORDER BY execution_date
ROWS BETWEEN 13 PRECEDING AND CURRENT ROW
) as fourteen_day_avg
FROM daily_scores
)
SELECT *,
CASE
WHEN three_day_avg IS NOT NULL AND seven_day_avg IS NOT NULL AND seven_day_avg > 0
THEN ((seven_day_avg - three_day_avg) / seven_day_avg * 100)
ELSE NULL
END as drop_percent
FROM trends
ORDER BY execution_date DESC
{limit_clause}
"""
start_date = datetime.now(timezone.utc) - timedelta(days=days_back)
params = [graph_id, start_date]
user_filter = ""
if user_id:
user_filter = 'AND e."userId" = $3::text'
params.append(user_id)
# Determine limit clause
limit_clause = "" if include_historical else "LIMIT 1"
final_query = query_template.format(
schema_prefix="{schema_prefix}",
user_filter=user_filter,
limit_clause=limit_clause,
)
result = await query_raw_with_schema(final_query, *params)
if not result:
return AccuracyTrendsResponse(
latest_data=AccuracyLatestData(
date=datetime.now(timezone.utc),
daily_score=None,
three_day_avg=None,
seven_day_avg=None,
fourteen_day_avg=None,
),
alert=None,
)
latest = result[0]
alert = None
if (
latest["drop_percent"] is not None
and latest["drop_percent"] >= drop_threshold
and latest["three_day_avg"] is not None
and latest["seven_day_avg"] is not None
):
alert = AccuracyAlertData(
graph_id=graph_id,
user_id=user_id,
drop_percent=float(latest["drop_percent"]),
three_day_avg=float(latest["three_day_avg"]),
seven_day_avg=float(latest["seven_day_avg"]),
detected_at=datetime.now(timezone.utc),
)
# Prepare historical data if requested
historical_data = None
if include_historical:
historical_data = []
for row in result:
historical_data.append(
AccuracyLatestData(
date=row["execution_date"],
daily_score=(
float(row["daily_score"])
if row["daily_score"] is not None
else None
),
three_day_avg=(
float(row["three_day_avg"])
if row["three_day_avg"] is not None
else None
),
seven_day_avg=(
float(row["seven_day_avg"])
if row["seven_day_avg"] is not None
else None
),
fourteen_day_avg=(
float(row["fourteen_day_avg"])
if row["fourteen_day_avg"] is not None
else None
),
)
)
return AccuracyTrendsResponse(
latest_data=AccuracyLatestData(
date=latest["execution_date"],
daily_score=(
float(latest["daily_score"])
if latest["daily_score"] is not None
else None
),
three_day_avg=(
float(latest["three_day_avg"])
if latest["three_day_avg"] is not None
else None
),
seven_day_avg=(
float(latest["seven_day_avg"])
if latest["seven_day_avg"] is not None
else None
),
fourteen_day_avg=(
float(latest["fourteen_day_avg"])
if latest["fourteen_day_avg"] is not None
else None
),
),
alert=alert,
historical_data=historical_data,
)
class MarketplaceGraphData(BaseModel):
"""Data structure for marketplace graph monitoring."""
graph_id: str
user_id: Optional[str]
execution_count: int
async def get_marketplace_graphs_for_monitoring(
days_back: int = 30,
min_executions: int = 10,
) -> list[MarketplaceGraphData]:
"""Get published marketplace graphs with recent executions for monitoring."""
query_template = """
WITH marketplace_graphs AS (
SELECT DISTINCT
slv."agentGraphId" as graph_id,
slv."agentGraphVersion" as graph_version
FROM {schema_prefix}"StoreListing" sl
JOIN {schema_prefix}"StoreListingVersion" slv ON sl."activeVersionId" = slv."id"
WHERE sl."hasApprovedVersion" = true
AND sl."isDeleted" = false
)
SELECT DISTINCT
mg.graph_id,
NULL as user_id, -- Marketplace graphs don't have a specific user_id for monitoring
COUNT(*) as execution_count
FROM marketplace_graphs mg
JOIN {schema_prefix}"AgentGraphExecution" e ON e."agentGraphId" = mg.graph_id
WHERE e."createdAt" >= $1::timestamp
AND e."isDeleted" = false
AND e."executionStatus" IN ('COMPLETED', 'FAILED', 'TERMINATED')
GROUP BY mg.graph_id
HAVING COUNT(*) >= $2
ORDER BY execution_count DESC
"""
start_date = datetime.now(timezone.utc) - timedelta(days=days_back)
result = await query_raw_with_schema(query_template, start_date, min_executions)
return [
MarketplaceGraphData(
graph_id=row["graph_id"],
user_id=row["user_id"],
execution_count=int(row["execution_count"]),
)
for row in result
]

View File

@@ -1465,3 +1465,35 @@ async def get_graph_execution_by_share_token(
created_at=execution.createdAt,
outputs=outputs,
)
async def get_frequently_executed_graphs(
days_back: int = 30,
min_executions: int = 10,
) -> list[dict]:
"""Get graphs that have been frequently executed for monitoring."""
query_template = """
SELECT DISTINCT
e."agentGraphId" as graph_id,
e."userId" as user_id,
COUNT(*) as execution_count
FROM {schema_prefix}"AgentGraphExecution" e
WHERE e."createdAt" >= $1::timestamp
AND e."isDeleted" = false
AND e."executionStatus" IN ('COMPLETED', 'FAILED', 'TERMINATED')
GROUP BY e."agentGraphId", e."userId"
HAVING COUNT(*) >= $2
ORDER BY execution_count DESC
"""
start_date = datetime.now(timezone.utc) - timedelta(days=days_back)
result = await query_raw_with_schema(query_template, start_date, min_executions)
return [
{
"graph_id": row["graph_id"],
"user_id": row["user_id"],
"execution_count": int(row["execution_count"]),
}
for row in result
]

View File

@@ -3,12 +3,17 @@ from contextlib import asynccontextmanager
from typing import TYPE_CHECKING, Callable, Concatenate, ParamSpec, TypeVar, cast
from backend.data import db
from backend.data.analytics import (
get_accuracy_trends_and_alerts,
get_marketplace_graphs_for_monitoring,
)
from backend.data.credit import UsageTransactionMetadata, get_user_credit_model
from backend.data.execution import (
create_graph_execution,
get_block_error_stats,
get_child_graph_executions,
get_execution_kv_data,
get_frequently_executed_graphs,
get_graph_execution_meta,
get_graph_executions,
get_graph_executions_count,
@@ -145,6 +150,9 @@ class DatabaseManager(AppService):
get_execution_kv_data = _(get_execution_kv_data)
set_execution_kv_data = _(set_execution_kv_data)
get_block_error_stats = _(get_block_error_stats)
get_accuracy_trends_and_alerts = _(get_accuracy_trends_and_alerts)
get_frequently_executed_graphs = _(get_frequently_executed_graphs)
get_marketplace_graphs_for_monitoring = _(get_marketplace_graphs_for_monitoring)
# Graphs
get_node = _(get_node)
@@ -226,6 +234,10 @@ class DatabaseManagerClient(AppServiceClient):
# Block error monitoring
get_block_error_stats = _(d.get_block_error_stats)
# Execution accuracy monitoring
get_accuracy_trends_and_alerts = _(d.get_accuracy_trends_and_alerts)
get_frequently_executed_graphs = _(d.get_frequently_executed_graphs)
get_marketplace_graphs_for_monitoring = _(d.get_marketplace_graphs_for_monitoring)
# Human In The Loop
has_pending_reviews_for_graph_exec = _(d.has_pending_reviews_for_graph_exec)

View File

@@ -33,6 +33,7 @@ from backend.monitoring import (
process_existing_batches,
process_weekly_summary,
report_block_error_rates,
report_execution_accuracy_alerts,
report_late_executions,
)
from backend.util.clients import get_scheduler_client
@@ -241,6 +242,11 @@ def cleanup_expired_files():
run_async(cleanup_expired_files_async())
def execution_accuracy_alerts():
"""Check execution accuracy and send alerts if drops are detected."""
return report_execution_accuracy_alerts()
# Monitoring functions are now imported from monitoring module
@@ -440,6 +446,17 @@ class Scheduler(AppService):
jobstore=Jobstores.EXECUTION.value,
)
# Execution Accuracy Monitoring - configurable interval
self.scheduler.add_job(
execution_accuracy_alerts,
id="report_execution_accuracy_alerts",
trigger="interval",
replace_existing=True,
seconds=config.execution_accuracy_check_interval_hours
* 3600, # Convert hours to seconds
jobstore=Jobstores.EXECUTION.value,
)
self.scheduler.add_listener(job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
self.scheduler.add_listener(job_missed_listener, EVENT_JOB_MISSED)
self.scheduler.add_listener(job_max_instances_listener, EVENT_JOB_MAX_INSTANCES)
@@ -587,6 +604,11 @@ class Scheduler(AppService):
"""Manually trigger cleanup of expired cloud storage files."""
return cleanup_expired_files()
@expose
def execute_report_execution_accuracy_alerts(self):
"""Manually trigger execution accuracy alert checking."""
return execution_accuracy_alerts()
class SchedulerClient(AppServiceClient):
@classmethod

View File

@@ -1,5 +1,6 @@
"""Monitoring module for platform health and alerting."""
from .accuracy_monitor import AccuracyMonitor, report_execution_accuracy_alerts
from .block_error_monitor import BlockErrorMonitor, report_block_error_rates
from .late_execution_monitor import (
LateExecutionException,
@@ -13,10 +14,12 @@ from .notification_monitor import (
)
__all__ = [
"AccuracyMonitor",
"BlockErrorMonitor",
"LateExecutionMonitor",
"LateExecutionException",
"NotificationJobArgs",
"report_execution_accuracy_alerts",
"report_block_error_rates",
"report_late_executions",
"process_existing_batches",

View File

@@ -0,0 +1,107 @@
"""Execution accuracy monitoring module."""
import logging
from backend.util.clients import (
get_database_manager_client,
get_notification_manager_client,
)
from backend.util.metrics import DiscordChannel, sentry_capture_error
from backend.util.settings import Config
logger = logging.getLogger(__name__)
config = Config()
class AccuracyMonitor:
"""Monitor execution accuracy trends and send alerts for drops."""
def __init__(self, drop_threshold: float = 10.0):
self.config = config
self.notification_client = get_notification_manager_client()
self.database_client = get_database_manager_client()
self.drop_threshold = drop_threshold
def check_execution_accuracy_alerts(self) -> str:
"""Check marketplace agents for accuracy drops and send alerts."""
try:
logger.info("Checking execution accuracy for marketplace agents")
# Get marketplace graphs using database client
graphs = self.database_client.get_marketplace_graphs_for_monitoring(
days_back=30, min_executions=10
)
alerts_found = 0
for graph_data in graphs:
result = self.database_client.get_accuracy_trends_and_alerts(
graph_id=graph_data.graph_id,
user_id=graph_data.user_id,
days_back=21, # 3 weeks
drop_threshold=self.drop_threshold,
)
if result.alert:
alert = result.alert
# Get graph details for better alert info
try:
graph_info = self.database_client.get_graph_metadata(
graph_id=alert.graph_id
)
graph_name = graph_info.name if graph_info else "Unknown Agent"
except Exception:
graph_name = "Unknown Agent"
# Create detailed alert message
alert_msg = (
f"🚨 **AGENT ACCURACY DROP DETECTED**\n\n"
f"**Agent:** {graph_name}\n"
f"**Graph ID:** `{alert.graph_id}`\n"
f"**Accuracy Drop:** {alert.drop_percent:.1f}%\n"
f"**Recent Performance:**\n"
f" • 3-day average: {alert.three_day_avg:.1f}%\n"
f" • 7-day average: {alert.seven_day_avg:.1f}%\n"
)
if alert.user_id:
alert_msg += f"**Owner:** {alert.user_id}\n"
# Send individual alert for each agent (not batched)
self.notification_client.discord_system_alert(
alert_msg, DiscordChannel.PRODUCT
)
alerts_found += 1
logger.warning(
f"Sent accuracy alert for agent: {graph_name} ({alert.graph_id})"
)
if alerts_found > 0:
return f"Alert sent for {alerts_found} agents with accuracy drops"
logger.info("No execution accuracy alerts detected")
return "No accuracy alerts detected"
except Exception as e:
logger.exception(f"Error checking execution accuracy alerts: {e}")
error = Exception(f"Error checking execution accuracy alerts: {e}")
msg = str(error)
sentry_capture_error(error)
self.notification_client.discord_system_alert(msg, DiscordChannel.PRODUCT)
return msg
def report_execution_accuracy_alerts(drop_threshold: float = 10.0) -> str:
"""
Check execution accuracy and send alerts if drops are detected.
Args:
drop_threshold: Percentage drop threshold to trigger alerts (default 10.0%)
Returns:
Status message indicating results of the check
"""
monitor = AccuracyMonitor(drop_threshold=drop_threshold)
return monitor.check_execution_accuracy_alerts()

View File

@@ -8,6 +8,10 @@ from fastapi import APIRouter, HTTPException, Security
from pydantic import BaseModel, Field
from backend.blocks.llm import LlmModel
from backend.data.analytics import (
AccuracyTrendsResponse,
get_accuracy_trends_and_alerts,
)
from backend.data.execution import (
ExecutionStatus,
GraphExecutionMeta,
@@ -83,6 +87,18 @@ class ExecutionAnalyticsConfig(BaseModel):
recommended_model: str
class AccuracyTrendsRequest(BaseModel):
graph_id: str = Field(..., description="Graph ID to analyze", min_length=1)
user_id: Optional[str] = Field(None, description="Optional user ID filter")
days_back: int = Field(30, description="Number of days to look back", ge=7, le=90)
drop_threshold: float = Field(
10.0, description="Alert threshold percentage", ge=1.0, le=50.0
)
include_historical: bool = Field(
False, description="Include historical data for charts"
)
router = APIRouter(
prefix="/admin",
tags=["admin", "execution_analytics"],
@@ -419,3 +435,40 @@ async def _process_batch(
return await asyncio.gather(
*[process_single_execution(execution) for execution in executions]
)
@router.get(
"/execution_accuracy_trends",
response_model=AccuracyTrendsResponse,
summary="Get Execution Accuracy Trends and Alerts",
)
async def get_execution_accuracy_trends(
graph_id: str,
user_id: Optional[str] = None,
days_back: int = 30,
drop_threshold: float = 10.0,
include_historical: bool = False,
admin_user_id: str = Security(get_user_id),
) -> AccuracyTrendsResponse:
"""
Get execution accuracy trends with moving averages and alert detection.
Simple single-query approach.
"""
logger.info(
f"Admin user {admin_user_id} requesting accuracy trends for graph {graph_id}"
)
try:
result = await get_accuracy_trends_and_alerts(
graph_id=graph_id,
days_back=days_back,
user_id=user_id,
drop_threshold=drop_threshold,
include_historical=include_historical,
)
return result
except Exception as e:
logger.exception(f"Error getting accuracy trends for graph {graph_id}: {e}")
raise HTTPException(status_code=500, detail=str(e))

View File

@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
router = APIRouter(
tags=["executions", "review", "private"],
tags=["v2", "executions", "review"],
dependencies=[Security(autogpt_auth_lib.requires_user)],
)

View File

@@ -185,6 +185,12 @@ class Config(UpdateTrackingModel["Config"], BaseSettings):
description="Number of top blocks with most errors to show when no blocks exceed threshold (0 to disable).",
)
# Execution Accuracy Monitoring
execution_accuracy_check_interval_hours: int = Field(
default=24,
description="Interval in hours between execution accuracy alert checks.",
)
model_config = SettingsConfigDict(
env_file=".env",
extra="allow",

View File

@@ -1,6 +1,16 @@
"use client";
import { useState, useEffect } from "react";
import {
LineChart,
Line,
XAxis,
YAxis,
CartesianGrid,
Tooltip,
Legend,
ResponsiveContainer,
} from "recharts";
import { Button } from "@/components/atoms/Button/Button";
import { Input } from "@/components/__legacy__/ui/input";
import { Label } from "@/components/__legacy__/ui/label";
@@ -18,9 +28,12 @@ import { useToast } from "@/components/molecules/Toast/use-toast";
import {
usePostV2GenerateExecutionAnalytics,
useGetV2GetExecutionAnalyticsConfiguration,
useGetV2GetExecutionAccuracyTrendsAndAlerts,
} from "@/app/api/__generated__/endpoints/admin/admin";
import type { ExecutionAnalyticsRequest } from "@/app/api/__generated__/models/executionAnalyticsRequest";
import type { ExecutionAnalyticsResponse } from "@/app/api/__generated__/models/executionAnalyticsResponse";
import type { AccuracyTrendsResponse } from "@/app/api/__generated__/models/accuracyTrendsResponse";
import type { AccuracyLatestData } from "@/app/api/__generated__/models/accuracyLatestData";
// Use the generated type with minimal adjustment for form handling
interface FormData extends Omit<ExecutionAnalyticsRequest, "created_after"> {
@@ -33,8 +46,133 @@ export function ExecutionAnalyticsForm() {
const [results, setResults] = useState<ExecutionAnalyticsResponse | null>(
null,
);
const [trendsData, setTrendsData] = useState<AccuracyTrendsResponse | null>(
null,
);
const { toast } = useToast();
// State for accuracy trends query parameters
const [accuracyParams, setAccuracyParams] = useState<{
graph_id: string;
user_id?: string;
days_back: number;
drop_threshold: number;
include_historical?: boolean;
} | null>(null);
// Use the generated API client for accuracy trends (GET)
const { data: accuracyApiResponse, error: accuracyError } =
useGetV2GetExecutionAccuracyTrendsAndAlerts(
accuracyParams || {
graph_id: "",
days_back: 30,
drop_threshold: 10.0,
include_historical: false,
},
{
query: {
enabled: !!accuracyParams?.graph_id,
},
},
);
// Update local state when data changes and handle success/error
useEffect(() => {
if (accuracyError) {
console.error("Failed to fetch trends:", accuracyError);
toast({
title: "Trends Error",
description:
(accuracyError as any)?.message || "Failed to fetch accuracy trends",
variant: "destructive",
});
return;
}
const data = accuracyApiResponse?.data;
if (data && "latest_data" in data) {
setTrendsData(data);
// Check for alerts
if (data.alert) {
toast({
title: "🚨 Accuracy Alert Detected",
description: `${data.alert.drop_percent.toFixed(1)}% accuracy drop detected for this agent`,
variant: "destructive",
});
}
}
}, [accuracyApiResponse, accuracyError, toast]);
// Chart component for accuracy trends
function AccuracyChart({ data }: { data: AccuracyLatestData[] }) {
const chartData = data.map((item) => ({
date: new Date(item.date).toLocaleDateString(),
"Daily Score": item.daily_score,
"3-Day Avg": item.three_day_avg,
"7-Day Avg": item.seven_day_avg,
"14-Day Avg": item.fourteen_day_avg,
}));
return (
<ResponsiveContainer width="100%" height={400}>
<LineChart
data={chartData}
margin={{ top: 5, right: 30, left: 20, bottom: 5 }}
>
<CartesianGrid strokeDasharray="3 3" />
<XAxis dataKey="date" />
<YAxis domain={[0, 100]} />
<Tooltip
formatter={(value) => [`${Number(value).toFixed(2)}%`, ""]}
/>
<Legend />
<Line
type="monotone"
dataKey="Daily Score"
stroke="#3b82f6"
strokeWidth={2}
dot={{ r: 3 }}
/>
<Line
type="monotone"
dataKey="3-Day Avg"
stroke="#10b981"
strokeWidth={2}
dot={{ r: 3 }}
/>
<Line
type="monotone"
dataKey="7-Day Avg"
stroke="#f59e0b"
strokeWidth={2}
dot={{ r: 3 }}
/>
<Line
type="monotone"
dataKey="14-Day Avg"
stroke="#8b5cf6"
strokeWidth={2}
dot={{ r: 3 }}
/>
</LineChart>
</ResponsiveContainer>
);
}
// Function to fetch accuracy trends using generated API client
const fetchAccuracyTrends = (graphId: string, userId?: string) => {
if (!graphId.trim()) return;
setAccuracyParams({
graph_id: graphId.trim(),
user_id: userId?.trim() || undefined,
days_back: 30,
drop_threshold: 10.0,
include_historical: showAccuracyChart, // Include historical data when chart is enabled
});
};
// Fetch configuration from API
const {
data: config,
@@ -50,6 +188,7 @@ export function ExecutionAnalyticsForm() {
}
const result = res.data;
setResults(result);
toast({
title: "Analytics Generated",
description: `Processed ${result.processed_executions} executions. ${result.successful_analytics} successful, ${result.failed_analytics} failed, ${result.skipped_executions} skipped.`,
@@ -58,11 +197,21 @@ export function ExecutionAnalyticsForm() {
},
onError: (error: any) => {
console.error("Analytics generation error:", error);
const errorMessage =
error?.message || error?.detail || "An unexpected error occurred";
const isOpenAIError = errorMessage.includes(
"OpenAI API key not configured",
);
toast({
title: "Analytics Generation Failed",
description:
error?.message || error?.detail || "An unexpected error occurred",
variant: "destructive",
title: isOpenAIError
? "Analytics Generation Skipped"
: "Analytics Generation Failed",
description: isOpenAIError
? "Analytics generation requires OpenAI configuration, but accuracy trends are still available above."
: errorMessage,
variant: isOpenAIError ? "default" : "destructive",
});
},
},
@@ -77,6 +226,9 @@ export function ExecutionAnalyticsForm() {
user_prompt: "", // Will use config default when empty
});
// State for accuracy trends chart toggle
const [showAccuracyChart, setShowAccuracyChart] = useState(true);
// Update form defaults when config loads
useEffect(() => {
if (config?.data && config.status === 200 && !formData.model_name) {
@@ -101,6 +253,11 @@ export function ExecutionAnalyticsForm() {
setResults(null);
// Fetch accuracy trends if chart is enabled
if (showAccuracyChart) {
fetchAccuracyTrends(formData.graph_id, formData.user_id || undefined);
}
// Prepare the request payload
const payload: ExecutionAnalyticsRequest = {
graph_id: formData.graph_id.trim(),
@@ -262,6 +419,18 @@ export function ExecutionAnalyticsForm() {
</Label>
</div>
{/* Show Accuracy Chart Checkbox */}
<div className="flex items-center space-x-2">
<Checkbox
id="show_accuracy_chart"
checked={showAccuracyChart}
onCheckedChange={(checked) => setShowAccuracyChart(!!checked)}
/>
<Label htmlFor="show_accuracy_chart" className="text-sm">
Show accuracy trends chart and historical data visualization
</Label>
</div>
{/* Custom System Prompt */}
<div className="space-y-2">
<Label htmlFor="system_prompt">
@@ -370,6 +539,98 @@ export function ExecutionAnalyticsForm() {
</div>
</form>
{/* Accuracy Trends Display */}
{trendsData && (
<div className="space-y-4">
<h3 className="text-lg font-semibold">Execution Accuracy Trends</h3>
{/* Alert Section */}
{trendsData.alert && (
<div className="rounded-lg border-l-4 border-red-500 bg-red-50 p-4">
<div className="flex items-start">
<span className="text-2xl">🚨</span>
<div className="ml-3 space-y-2">
<h4 className="text-lg font-semibold text-red-800">
Accuracy Alert Detected
</h4>
<p className="text-red-700">
<strong>
{trendsData.alert.drop_percent.toFixed(1)}% accuracy drop
</strong>{" "}
detected for agent{" "}
<code className="rounded bg-red-100 px-1 text-sm">
{formData.graph_id}
</code>
</p>
<div className="space-y-1 text-sm text-red-600">
<p>
3-day average:{" "}
<strong>
{trendsData.alert.three_day_avg.toFixed(2)}%
</strong>
</p>
<p>
7-day average:{" "}
<strong>
{trendsData.alert.seven_day_avg.toFixed(2)}%
</strong>
</p>
<p>
Detected at:{" "}
<strong>
{new Date(
trendsData.alert.detected_at,
).toLocaleString()}
</strong>
</p>
</div>
</div>
</div>
</div>
)}
{/* Latest Data Summary */}
<div className="grid grid-cols-2 gap-4 md:grid-cols-4">
<div className="rounded-lg border bg-white p-4 text-center">
<div className="text-2xl font-bold text-blue-600">
{trendsData.latest_data.daily_score?.toFixed(2) || "N/A"}
</div>
<div className="text-sm text-gray-600">Daily Score</div>
</div>
<div className="rounded-lg border bg-white p-4 text-center">
<div className="text-2xl font-bold text-green-600">
{trendsData.latest_data.three_day_avg?.toFixed(2) || "N/A"}
</div>
<div className="text-sm text-gray-600">3-Day Avg</div>
</div>
<div className="rounded-lg border bg-white p-4 text-center">
<div className="text-2xl font-bold text-orange-600">
{trendsData.latest_data.seven_day_avg?.toFixed(2) || "N/A"}
</div>
<div className="text-sm text-gray-600">7-Day Avg</div>
</div>
<div className="rounded-lg border bg-white p-4 text-center">
<div className="text-2xl font-bold text-purple-600">
{trendsData.latest_data.fourteen_day_avg?.toFixed(2) || "N/A"}
</div>
<div className="text-sm text-gray-600">14-Day Avg</div>
</div>
</div>
{/* Chart Section - only show when toggle is enabled and historical data exists */}
{showAccuracyChart && trendsData?.historical_data && (
<div className="mt-6">
<h4 className="mb-4 text-lg font-semibold">
Execution Accuracy Trends Chart
</h4>
<div className="rounded-lg border bg-white p-6">
<AccuracyChart data={trendsData.historical_data} />
</div>
</div>
)}
</div>
)}
{results && <AnalyticsResultsTable results={results} />}
</div>
);

View File

@@ -17,12 +17,13 @@ function ExecutionAnalyticsDashboard() {
</div>
<div className="rounded-lg border bg-white p-6 shadow-sm">
<h2 className="mb-4 text-xl font-semibold">Analytics Generation</h2>
<h2 className="mb-4 text-xl font-semibold">
Execution Analytics & Accuracy Monitoring
</h2>
<p className="mb-6 text-gray-600">
-          This tool will identify completed executions missing activity
-          summaries or success scores and generate them using AI. Only
-          executions that meet the criteria and are missing these fields will
-          be processed.
+          Generate missing activity summaries and success scores for agent
+          executions. After generation, accuracy trends and alerts will
+          automatically be displayed to help monitor agent health over time.
</p>
<Suspense

View File

@@ -4072,16 +4072,84 @@
"security": [{ "HTTPBearerJWT": [] }]
}
},
"/api/executions/admin/execution_accuracy_trends": {
"get": {
"tags": ["v2", "admin", "admin", "execution_analytics"],
"summary": "Get Execution Accuracy Trends and Alerts",
"description": "Get execution accuracy trends with moving averages and alert detection.\nSimple single-query approach.",
"operationId": "getV2Get execution accuracy trends and alerts",
"security": [{ "HTTPBearerJWT": [] }],
"parameters": [
{
"name": "graph_id",
"in": "query",
"required": true,
"schema": { "type": "string", "title": "Graph Id" }
},
{
"name": "user_id",
"in": "query",
"required": false,
"schema": {
"anyOf": [{ "type": "string" }, { "type": "null" }],
"title": "User Id"
}
},
{
"name": "days_back",
"in": "query",
"required": false,
"schema": { "type": "integer", "default": 30, "title": "Days Back" }
},
{
"name": "drop_threshold",
"in": "query",
"required": false,
"schema": {
"type": "number",
"default": 10.0,
"title": "Drop Threshold"
}
},
{
"name": "include_historical",
"in": "query",
"required": false,
"schema": {
"type": "boolean",
"default": false,
"title": "Include Historical"
}
}
],
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/AccuracyTrendsResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": { "$ref": "#/components/schemas/HTTPValidationError" }
}
}
},
"401": {
"$ref": "#/components/responses/HTTP401NotAuthenticatedError"
}
}
}
},
"/api/review/pending": {
"get": {
"tags": [
"v2",
"executions",
"review",
"executions",
"review",
"private"
],
"tags": ["v2", "executions", "review", "v2", "executions", "review"],
"summary": "Get Pending Reviews",
"description": "Get all pending reviews for the current user.\n\nRetrieves all reviews with status \"WAITING\" that belong to the authenticated user.\nResults are ordered by creation time (newest first).\n\nArgs:\n user_id: Authenticated user ID from security dependency\n\nReturns:\n List of pending review objects with status converted to typed literals\n\nRaises:\n HTTPException: If authentication fails or database error occurs\n\nNote:\n Reviews with invalid status values are logged as warnings but excluded\n from results rather than failing the entire request.",
"operationId": "getV2Get pending reviews",
@@ -4150,14 +4218,7 @@
},
"/api/review/execution/{graph_exec_id}": {
"get": {
"tags": [
"v2",
"executions",
"review",
"executions",
"review",
"private"
],
"tags": ["v2", "executions", "review", "v2", "executions", "review"],
"summary": "Get Pending Reviews for Execution",
"description": "Get all pending reviews for a specific graph execution.\n\nRetrieves all reviews with status \"WAITING\" for the specified graph execution\nthat belong to the authenticated user. Results are ordered by creation time\n(oldest first) to preserve review order within the execution.\n\nArgs:\n graph_exec_id: ID of the graph execution to get reviews for\n user_id: Authenticated user ID from security dependency\n\nReturns:\n List of pending review objects for the specified execution\n\nRaises:\n HTTPException:\n - 403: If user doesn't own the graph execution\n - 500: If authentication fails or database error occurs\n\nNote:\n Only returns reviews owned by the authenticated user for security.\n Reviews with invalid status are excluded with warning logs.",
"operationId": "getV2Get pending reviews for execution",
@@ -4207,14 +4268,7 @@
},
"/api/review/action": {
"post": {
"tags": [
"v2",
"executions",
"review",
"executions",
"review",
"private"
],
"tags": ["v2", "executions", "review", "v2", "executions", "review"],
"summary": "Process Review Action",
"description": "Process reviews with approve or reject actions.",
"operationId": "postV2ProcessReviewAction",
@@ -5429,6 +5483,90 @@
"enum": ["ACTIVE", "REVOKED", "SUSPENDED"],
"title": "APIKeyStatus"
},
"AccuracyAlertData": {
"properties": {
"graph_id": { "type": "string", "title": "Graph Id" },
"user_id": {
"anyOf": [{ "type": "string" }, { "type": "null" }],
"title": "User Id"
},
"drop_percent": { "type": "number", "title": "Drop Percent" },
"three_day_avg": { "type": "number", "title": "Three Day Avg" },
"seven_day_avg": { "type": "number", "title": "Seven Day Avg" },
"detected_at": {
"type": "string",
"format": "date-time",
"title": "Detected At"
}
},
"type": "object",
"required": [
"graph_id",
"user_id",
"drop_percent",
"three_day_avg",
"seven_day_avg",
"detected_at"
],
"title": "AccuracyAlertData",
"description": "Alert data when accuracy drops significantly."
},
"AccuracyLatestData": {
"properties": {
"date": { "type": "string", "format": "date-time", "title": "Date" },
"daily_score": {
"anyOf": [{ "type": "number" }, { "type": "null" }],
"title": "Daily Score"
},
"three_day_avg": {
"anyOf": [{ "type": "number" }, { "type": "null" }],
"title": "Three Day Avg"
},
"seven_day_avg": {
"anyOf": [{ "type": "number" }, { "type": "null" }],
"title": "Seven Day Avg"
},
"fourteen_day_avg": {
"anyOf": [{ "type": "number" }, { "type": "null" }],
"title": "Fourteen Day Avg"
}
},
"type": "object",
"required": [
"date",
"daily_score",
"three_day_avg",
"seven_day_avg",
"fourteen_day_avg"
],
"title": "AccuracyLatestData",
"description": "Latest execution accuracy data point."
},
"AccuracyTrendsResponse": {
"properties": {
"latest_data": { "$ref": "#/components/schemas/AccuracyLatestData" },
"alert": {
"anyOf": [
{ "$ref": "#/components/schemas/AccuracyAlertData" },
{ "type": "null" }
]
},
"historical_data": {
"anyOf": [
{
"items": { "$ref": "#/components/schemas/AccuracyLatestData" },
"type": "array"
},
{ "type": "null" }
],
"title": "Historical Data"
}
},
"type": "object",
"required": ["latest_data", "alert"],
"title": "AccuracyTrendsResponse",
"description": "Response model for accuracy trends and alerts."
},
"AddUserCreditsResponse": {
"properties": {
"new_balance": { "type": "integer", "title": "New Balance" },