feat(backend): Expose execution prometheus metrics (#9866)

Currently, we have no visibility into the state of the execution manager. The scope of this PR is to improve its observability by exposing Prometheus metrics.

### Changes 🏗️

Re-use the execution manager port to expose the Prometheus metrics.

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Hit /metrics on port 8002
2026-04-08 03:00:28 -04:00 · 2025-04-24 09:48:38 +02:00
parent eb6a0b34e1
commit 7fbe135ec8
4 changed files with 45 additions and 6 deletions
--- a/autogpt_platform/backend/backend/executor/manager.py
+++ b/autogpt_platform/backend/backend/executor/manager.py
@@ -30,6 +30,7 @@ if TYPE_CHECKING:
    from backend.notifications.notifications import NotificationManager

 from autogpt_libs.utils.cache import clear_thread_cache, thread_cached
+from prometheus_client import Gauge, start_http_server

 from backend.blocks.agent import AgentExecutorBlock
 from backend.data import redis
@@ -68,6 +69,17 @@ from backend.util.settings import Settings
 logger = logging.getLogger(__name__)
 settings = Settings()

+active_runs_gauge = Gauge(
+    "execution_manager_active_runs", "Number of active graph runs"
+)
+pool_size_gauge = Gauge(
+    "execution_manager_pool_size", "Maximum number of graph workers"
+)
+utilization_gauge = Gauge(
+    "execution_manager_utilization_ratio",
+    "Ratio of active graph runs to max graph workers",
+)
+

 class LogMetadata:
    def __init__(
@@ -898,14 +910,21 @@ class ExecutionManager(AppProcess):
        self.running = True
        self.active_graph_runs: dict[str, tuple[Future, threading.Event]] = {}

-    @classmethod
-    def get_port(cls) -> int:
-        return settings.config.execution_manager_port
-
    def run(self):
+        pool_size_gauge.set(self.pool_size)
+        active_runs_gauge.set(0)
+        utilization_gauge.set(0)
        retry_count_max = settings.config.execution_manager_loop_max_retry
        retry_count = 0

+        self.metrics_server = threading.Thread(
+            target=start_http_server,
+            args=(settings.config.execution_manager_port,),
+            daemon=True,
+        )
+        self.metrics_server.start()
+        logger.info(f"[{self.service_name}] Starting execution manager...")
+
        for retry_count in range(retry_count_max):
            try:
                self._run()
@@ -1023,11 +1042,15 @@ class ExecutionManager(AppProcess):
            Executor.on_graph_execution, graph_exec_entry, cancel_event
        )
        self.active_graph_runs[graph_exec_id] = (future, cancel_event)
+        active_runs_gauge.set(len(self.active_graph_runs))
+        utilization_gauge.set(len(self.active_graph_runs) / self.pool_size)

        def _on_run_done(f: Future):
            logger.info(f"[{self.service_name}] Run completed for {graph_exec_id}")
            try:
                self.active_graph_runs.pop(graph_exec_id, None)
+                active_runs_gauge.set(len(self.active_graph_runs))
+                utilization_gauge.set(len(self.active_graph_runs) / self.pool_size)
                if f.exception():
                    logger.error(
                        f"[{self.service_name}] Execution for {graph_exec_id} failed: {f.exception()}"
--- a/autogpt_platform/backend/poetry.lock
+++ b/autogpt_platform/backend/poetry.lock
@@ -3568,6 +3568,21 @@ files = [
 [package.dependencies]
 tqdm = "*"

+[[package]]
+name = "prometheus-client"
+version = "0.21.1"
+description = "Python client for the Prometheus monitoring system."
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "prometheus_client-0.21.1-py3-none-any.whl", hash = "sha256:594b45c410d6f4f8888940fe80b5cc2521b305a1fafe1c58609ef715a001f301"},
+    {file = "prometheus_client-0.21.1.tar.gz", hash = "sha256:252505a722ac04b0456be05c05f75f45d760c2911ffc45f2a06bcaed9f3ae3fb"},
+]
+
+[package.extras]
+twisted = ["twisted"]
+
 [[package]]
 name = "propcache"
 version = "0.2.1"
@@ -6310,4 +6325,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "781f77ec77cfce78b34fb57063dcc81df8e9c5a4be9a644033a0c197e0063730"
+content-hash = "29ccee704d8296c57156daab98bb0cbbf5a43e83526b7f08a14c91fb7a4898f4"
--- a/autogpt_platform/backend/pyproject.toml
+++ b/autogpt_platform/backend/pyproject.toml
@@ -64,6 +64,7 @@ websockets = "^14.2"
 youtube-transcript-api = "^0.6.2"
 zerobouncesdk = "^1.1.1"
 # NOTE: please insert new dependencies in their alphabetical location
+prometheus-client = "^0.21.1"

 [tool.poetry.group.dev.dependencies]
 aiohappyeyeballs = "^2.6.1"
--- a/autogpt_platform/docker-compose.platform.yml
+++ b/autogpt_platform/docker-compose.platform.yml
@@ -142,7 +142,7 @@ services:
      - NOTIFICATIONMANAGER_HOST=rest_server
      - ENCRYPTION_KEY=dvziYgz0KSK8FENhju0ZYi8-fRTfAdlz6YLhdB_jhNw= # DO NOT USE IN PRODUCTION!!
    ports:
-      - "8002:8000"
+      - "8002:8002"
    networks:
      - app-network