mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
feat(backend): Expose execution prometheus metrics (#9866)
Currently, we have no visibility into the state of the execution manager. The scope of this PR is to improve its observability by exposing Prometheus metrics.

### Changes 🏗️

Re-use the execution manager port to expose the Prometheus metrics.

### Checklist 📋

#### For code changes:

- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Hit /metrics on port 8002
This commit is contained in:
@@ -30,6 +30,7 @@ if TYPE_CHECKING:
|
||||
from backend.notifications.notifications import NotificationManager
|
||||
|
||||
from autogpt_libs.utils.cache import clear_thread_cache, thread_cached
|
||||
from prometheus_client import Gauge, start_http_server
|
||||
|
||||
from backend.blocks.agent import AgentExecutorBlock
|
||||
from backend.data import redis
|
||||
@@ -68,6 +69,17 @@ from backend.util.settings import Settings
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = Settings()
|
||||
|
||||
active_runs_gauge = Gauge(
|
||||
"execution_manager_active_runs", "Number of active graph runs"
|
||||
)
|
||||
pool_size_gauge = Gauge(
|
||||
"execution_manager_pool_size", "Maximum number of graph workers"
|
||||
)
|
||||
utilization_gauge = Gauge(
|
||||
"execution_manager_utilization_ratio",
|
||||
"Ratio of active graph runs to max graph workers",
|
||||
)
|
||||
|
||||
|
||||
class LogMetadata:
|
||||
def __init__(
|
||||
@@ -898,14 +910,21 @@ class ExecutionManager(AppProcess):
|
||||
self.running = True
|
||||
self.active_graph_runs: dict[str, tuple[Future, threading.Event]] = {}
|
||||
|
||||
@classmethod
|
||||
def get_port(cls) -> int:
|
||||
return settings.config.execution_manager_port
|
||||
|
||||
def run(self):
|
||||
pool_size_gauge.set(self.pool_size)
|
||||
active_runs_gauge.set(0)
|
||||
utilization_gauge.set(0)
|
||||
retry_count_max = settings.config.execution_manager_loop_max_retry
|
||||
retry_count = 0
|
||||
|
||||
self.metrics_server = threading.Thread(
|
||||
target=start_http_server,
|
||||
args=(settings.config.execution_manager_port,),
|
||||
daemon=True,
|
||||
)
|
||||
self.metrics_server.start()
|
||||
logger.info(f"[{self.service_name}] Starting execution manager...")
|
||||
|
||||
for retry_count in range(retry_count_max):
|
||||
try:
|
||||
self._run()
|
||||
@@ -1023,11 +1042,15 @@ class ExecutionManager(AppProcess):
|
||||
Executor.on_graph_execution, graph_exec_entry, cancel_event
|
||||
)
|
||||
self.active_graph_runs[graph_exec_id] = (future, cancel_event)
|
||||
active_runs_gauge.set(len(self.active_graph_runs))
|
||||
utilization_gauge.set(len(self.active_graph_runs) / self.pool_size)
|
||||
|
||||
def _on_run_done(f: Future):
|
||||
logger.info(f"[{self.service_name}] Run completed for {graph_exec_id}")
|
||||
try:
|
||||
self.active_graph_runs.pop(graph_exec_id, None)
|
||||
active_runs_gauge.set(len(self.active_graph_runs))
|
||||
utilization_gauge.set(len(self.active_graph_runs) / self.pool_size)
|
||||
if f.exception():
|
||||
logger.error(
|
||||
f"[{self.service_name}] Execution for {graph_exec_id} failed: {f.exception()}"
|
||||
|
||||
17
autogpt_platform/backend/poetry.lock
generated
17
autogpt_platform/backend/poetry.lock
generated
@@ -3568,6 +3568,21 @@ files = [
|
||||
[package.dependencies]
|
||||
tqdm = "*"
|
||||
|
||||
[[package]]
|
||||
name = "prometheus-client"
|
||||
version = "0.21.1"
|
||||
description = "Python client for the Prometheus monitoring system."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "prometheus_client-0.21.1-py3-none-any.whl", hash = "sha256:594b45c410d6f4f8888940fe80b5cc2521b305a1fafe1c58609ef715a001f301"},
|
||||
{file = "prometheus_client-0.21.1.tar.gz", hash = "sha256:252505a722ac04b0456be05c05f75f45d760c2911ffc45f2a06bcaed9f3ae3fb"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
twisted = ["twisted"]
|
||||
|
||||
[[package]]
|
||||
name = "propcache"
|
||||
version = "0.2.1"
|
||||
@@ -6310,4 +6325,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "781f77ec77cfce78b34fb57063dcc81df8e9c5a4be9a644033a0c197e0063730"
|
||||
content-hash = "29ccee704d8296c57156daab98bb0cbbf5a43e83526b7f08a14c91fb7a4898f4"
|
||||
|
||||
@@ -64,6 +64,7 @@ websockets = "^14.2"
|
||||
youtube-transcript-api = "^0.6.2"
|
||||
zerobouncesdk = "^1.1.1"
|
||||
# NOTE: please insert new dependencies in their alphabetical location
|
||||
prometheus-client = "^0.21.1"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
aiohappyeyeballs = "^2.6.1"
|
||||
|
||||
@@ -142,7 +142,7 @@ services:
|
||||
- NOTIFICATIONMANAGER_HOST=rest_server
|
||||
- ENCRYPTION_KEY=dvziYgz0KSK8FENhju0ZYi8-fRTfAdlz6YLhdB_jhNw= # DO NOT USE IN PRODUCTION!!
|
||||
ports:
|
||||
- "8002:8000"
|
||||
- "8002:8002"
|
||||
networks:
|
||||
- app-network
|
||||
|
||||
|
||||
Reference in New Issue
Block a user