feat(backend): Expose execution prometheus metrics (#9866)

Currently, we have no visibility into the state of the execution manager.
This PR improves its observability by exposing its state as
Prometheus metrics.

### Changes 🏗️

Re-use the execution manager port to expose the Prometheus metrics.

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Hit `/metrics` on port 8002
This commit is contained in:
Zamil Majdy
2025-04-24 09:48:38 +02:00
committed by GitHub
parent eb6a0b34e1
commit 7fbe135ec8
4 changed files with 45 additions and 6 deletions

View File

@@ -30,6 +30,7 @@ if TYPE_CHECKING:
from backend.notifications.notifications import NotificationManager
from autogpt_libs.utils.cache import clear_thread_cache, thread_cached
from prometheus_client import Gauge, start_http_server
from backend.blocks.agent import AgentExecutorBlock
from backend.data import redis
@@ -68,6 +69,17 @@ from backend.util.settings import Settings
logger = logging.getLogger(__name__)
settings = Settings()
# Prometheus gauges describing the execution manager's load. They are created
# once at module import and updated by ExecutionManager as runs start/finish.

# Number of graph runs currently tracked as active.
active_runs_gauge = Gauge(
    name="execution_manager_active_runs",
    documentation="Number of active graph runs",
)

# Configured upper bound on concurrent graph workers.
pool_size_gauge = Gauge(
    name="execution_manager_pool_size",
    documentation="Maximum number of graph workers",
)

# Active runs divided by the pool size.
utilization_gauge = Gauge(
    name="execution_manager_utilization_ratio",
    documentation="Ratio of active graph runs to max graph workers",
)
class LogMetadata:
def __init__(
@@ -898,14 +910,21 @@ class ExecutionManager(AppProcess):
self.running = True
self.active_graph_runs: dict[str, tuple[Future, threading.Event]] = {}
@classmethod
def get_port(cls) -> int:
    """Return the port configured for the execution manager.

    The value is read from ``settings.config.execution_manager_port`` on
    every call.
    """
    execution_port: int = settings.config.execution_manager_port
    return execution_port
def run(self):
pool_size_gauge.set(self.pool_size)
active_runs_gauge.set(0)
utilization_gauge.set(0)
retry_count_max = settings.config.execution_manager_loop_max_retry
retry_count = 0
self.metrics_server = threading.Thread(
target=start_http_server,
args=(settings.config.execution_manager_port,),
daemon=True,
)
self.metrics_server.start()
logger.info(f"[{self.service_name}] Starting execution manager...")
for retry_count in range(retry_count_max):
try:
self._run()
@@ -1023,11 +1042,15 @@ class ExecutionManager(AppProcess):
Executor.on_graph_execution, graph_exec_entry, cancel_event
)
self.active_graph_runs[graph_exec_id] = (future, cancel_event)
active_runs_gauge.set(len(self.active_graph_runs))
utilization_gauge.set(len(self.active_graph_runs) / self.pool_size)
def _on_run_done(f: Future):
logger.info(f"[{self.service_name}] Run completed for {graph_exec_id}")
try:
self.active_graph_runs.pop(graph_exec_id, None)
active_runs_gauge.set(len(self.active_graph_runs))
utilization_gauge.set(len(self.active_graph_runs) / self.pool_size)
if f.exception():
logger.error(
f"[{self.service_name}] Execution for {graph_exec_id} failed: {f.exception()}"

View File

@@ -3568,6 +3568,21 @@ files = [
[package.dependencies]
tqdm = "*"
[[package]]
name = "prometheus-client"
version = "0.21.1"
description = "Python client for the Prometheus monitoring system."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "prometheus_client-0.21.1-py3-none-any.whl", hash = "sha256:594b45c410d6f4f8888940fe80b5cc2521b305a1fafe1c58609ef715a001f301"},
{file = "prometheus_client-0.21.1.tar.gz", hash = "sha256:252505a722ac04b0456be05c05f75f45d760c2911ffc45f2a06bcaed9f3ae3fb"},
]
[package.extras]
twisted = ["twisted"]
[[package]]
name = "propcache"
version = "0.2.1"
@@ -6310,4 +6325,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "781f77ec77cfce78b34fb57063dcc81df8e9c5a4be9a644033a0c197e0063730"
content-hash = "29ccee704d8296c57156daab98bb0cbbf5a43e83526b7f08a14c91fb7a4898f4"

View File

@@ -64,6 +64,7 @@ websockets = "^14.2"
youtube-transcript-api = "^0.6.2"
zerobouncesdk = "^1.1.1"
# NOTE: please insert new dependencies in their alphabetical location
prometheus-client = "^0.21.1"
[tool.poetry.group.dev.dependencies]
aiohappyeyeballs = "^2.6.1"

View File

@@ -142,7 +142,7 @@ services:
- NOTIFICATIONMANAGER_HOST=rest_server
- ENCRYPTION_KEY=dvziYgz0KSK8FENhju0ZYi8-fRTfAdlz6YLhdB_jhNw= # DO NOT USE IN PRODUCTION!!
ports:
- "8002:8000"
- "8002:8002"
networks:
- app-network