mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
feat(backend): standardize service health checks with UnhealthyServiceError (#10584)
This PR standardizes health check error handling across all services by introducing and using a consistent `UnhealthyServiceError` exception type. This improves monitoring, debugging, and service reliability by providing uniform error reporting when services are unhealthy.

### Changes 🏗️

- **Added `UnhealthyServiceError` class** in `backend/util/service.py`:
  - Custom exception for unhealthy service states
  - Includes service name in error message
  - Added to `EXCEPTION_MAPPING` for proper serialization
- **Updated health checks across services** to use `UnhealthyServiceError`:
  - **Database service** (`backend/executor/database.py`): Replace `RuntimeError` with `UnhealthyServiceError` for database connection failures
  - **Scheduler service** (`backend/executor/scheduler.py`): Replace `RuntimeError` with `UnhealthyServiceError` for scheduler initialization and running state checks
  - **Notification service** (`backend/notifications/notifications.py`):
    - Replace `RuntimeError` with `UnhealthyServiceError` for RabbitMQ configuration issues
    - Added new `health_check()` method to verify RabbitMQ readiness
  - **REST API** (`backend/server/rest_api.py`): Replace `RuntimeError` with `UnhealthyServiceError` for database health checks
- **Updated imports** across all affected files to include `UnhealthyServiceError`

### Checklist 📋

#### For code changes:

- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] Verified health check endpoints return appropriate errors when services are unhealthy
  - [x] Confirmed services start up properly and health checks pass when healthy
  - [x] Tested error serialization through API responses
  - [x] Verified no breaking changes to existing functionality

#### For configuration changes:

- [x] `.env.example` is updated or already compatible with my changes
- [x] `docker-compose.yml` is updated or already compatible with my changes
- [x] I have included a list of my configuration changes in the PR description (under **Changes**)

No configuration changes were made in this PR - only code changes to improve error handling consistency.

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -41,7 +41,13 @@ from backend.data.user import (
|
||||
get_user_notification_preference,
|
||||
update_user_integrations,
|
||||
)
|
||||
from backend.util.service import AppService, AppServiceClient, endpoint_to_sync, expose
|
||||
from backend.util.service import (
|
||||
AppService,
|
||||
AppServiceClient,
|
||||
UnhealthyServiceError,
|
||||
endpoint_to_sync,
|
||||
expose,
|
||||
)
|
||||
from backend.util.settings import Config
|
||||
|
||||
config = Config()
|
||||
@@ -75,7 +81,7 @@ class DatabaseManager(AppService):
|
||||
|
||||
def health_check(self) -> str:
|
||||
if not db.is_connected():
|
||||
raise RuntimeError("Database is not connected")
|
||||
raise UnhealthyServiceError("Database is not connected")
|
||||
return super().health_check()
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -31,7 +31,13 @@ from backend.util.cloud_storage import cleanup_expired_files_async
|
||||
from backend.util.exceptions import NotAuthorizedError, NotFoundError
|
||||
from backend.util.logging import PrefixFilter
|
||||
from backend.util.retry import func_retry
|
||||
from backend.util.service import AppService, AppServiceClient, endpoint_to_async, expose
|
||||
from backend.util.service import (
|
||||
AppService,
|
||||
AppServiceClient,
|
||||
UnhealthyServiceError,
|
||||
endpoint_to_async,
|
||||
expose,
|
||||
)
|
||||
from backend.util.settings import Config
|
||||
|
||||
|
||||
@@ -192,7 +198,7 @@ class Scheduler(AppService):
|
||||
def health_check(self) -> str:
|
||||
# Thread-safe health check with proper initialization handling
|
||||
if not hasattr(self, "scheduler"):
|
||||
raise RuntimeError("Scheduler is still initializing")
|
||||
raise UnhealthyServiceError("Scheduler is still initializing")
|
||||
|
||||
# Check if we're in the middle of cleanup
|
||||
if self.cleaned_up:
|
||||
@@ -200,8 +206,7 @@ class Scheduler(AppService):
|
||||
|
||||
# Normal operation - check if scheduler is running
|
||||
if not self.scheduler.running:
|
||||
logger.error(f"{self.service_name} the scheduler is not running!")
|
||||
raise RuntimeError("Scheduler is not running")
|
||||
raise UnhealthyServiceError("Scheduler is not running")
|
||||
|
||||
return super().health_check()
|
||||
|
||||
|
||||
@@ -31,7 +31,13 @@ from backend.util.clients import get_database_manager_async_client
|
||||
from backend.util.logging import TruncatedLogger
|
||||
from backend.util.metrics import discord_send_alert
|
||||
from backend.util.retry import continuous_retry
|
||||
from backend.util.service import AppService, AppServiceClient, endpoint_to_sync, expose
|
||||
from backend.util.service import (
|
||||
AppService,
|
||||
AppServiceClient,
|
||||
UnhealthyServiceError,
|
||||
endpoint_to_sync,
|
||||
expose,
|
||||
)
|
||||
from backend.util.settings import Settings
|
||||
|
||||
logger = TruncatedLogger(logging.getLogger(__name__), "[NotificationManager]")
|
||||
@@ -183,16 +189,24 @@ class NotificationManager(AppService):
|
||||
def rabbit(self) -> rabbitmq.AsyncRabbitMQ:
|
||||
"""Access the RabbitMQ service. Will raise if not configured."""
|
||||
if not hasattr(self, "rabbitmq_service") or not self.rabbitmq_service:
|
||||
raise RuntimeError("RabbitMQ not configured for this service")
|
||||
raise UnhealthyServiceError("RabbitMQ not configured for this service")
|
||||
return self.rabbitmq_service
|
||||
|
||||
@property
|
||||
def rabbit_config(self) -> rabbitmq.RabbitMQConfig:
|
||||
"""Access the RabbitMQ config. Will raise if not configured."""
|
||||
if not self.rabbitmq_config:
|
||||
raise RuntimeError("RabbitMQ not configured for this service")
|
||||
raise UnhealthyServiceError("RabbitMQ not configured for this service")
|
||||
return self.rabbitmq_config
|
||||
|
||||
def health_check(self) -> str:
    """Report health, treating a missing or not-ready RabbitMQ service as unhealthy.

    Returns:
        Whatever the parent class's ``health_check()`` returns once the
        RabbitMQ checks pass.

    Raises:
        UnhealthyServiceError: If ``rabbitmq_service`` is absent or falsy, or
            if its ``is_ready`` attribute is falsy.
    """
    # A single lookup covers both "attribute never set" and "set but falsy".
    broker = getattr(self, "rabbitmq_service", None)
    if not broker:
        raise UnhealthyServiceError("RabbitMQ not configured for this service")

    if not broker.is_ready:
        raise UnhealthyServiceError("RabbitMQ channel is not ready")

    return super().health_check()
|
||||
|
||||
@classmethod
|
||||
def get_port(cls) -> int:
|
||||
return settings.config.notification_service_port
|
||||
|
||||
@@ -41,6 +41,7 @@ from backend.server.external.api import external_app
|
||||
from backend.server.middleware.security import SecurityHeadersMiddleware
|
||||
from backend.util import json
|
||||
from backend.util.cloud_storage import shutdown_cloud_storage_handler
|
||||
from backend.util.service import UnhealthyServiceError
|
||||
|
||||
settings = backend.util.settings.Settings()
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -232,7 +233,7 @@ app.mount("/external-api", external_app)
|
||||
@app.get(path="/health", tags=["health"], dependencies=[])
|
||||
async def health():
|
||||
if not backend.data.db.is_connected():
|
||||
raise RuntimeError("Database is not connected")
|
||||
raise UnhealthyServiceError("Database is not connected")
|
||||
return {"status": "healthy"}
|
||||
|
||||
|
||||
|
||||
@@ -97,6 +97,18 @@ class RemoteCallError(BaseModel):
|
||||
args: Optional[Tuple[Any, ...]] = None
|
||||
|
||||
|
||||
class UnhealthyServiceError(ValueError):
    """Error raised when a service reports itself as unhealthy.

    The final error text is prefixed with the name returned by
    ``get_service_name()`` and includes the caller-supplied reason, followed
    by a hint to check the service health.

    Args:
        message: Reason the service is considered unhealthy.
        log: When true, the final message is also emitted via ``logger.error``
            at construction time.

    Attributes:
        message: The fully formatted error text (also returned by ``str()``).
    """

    def __init__(self, message: str = "Service is unhealthy", log: bool = True):
        # Include the caller's reason: previously `message` was accepted but
        # ignored, so every error carried only the generic hint and the actual
        # cause (e.g. "Database is not connected") was lost.
        msg = f"[{get_service_name()}] {message} - please check the service health."
        super().__init__(msg)
        self.message = msg
        if log:
            logger.error(self.message)

    def __str__(self):
        return self.message
|
||||
|
||||
|
||||
EXCEPTION_MAPPING = {
|
||||
e.__name__: e
|
||||
for e in [
|
||||
@@ -104,6 +116,7 @@ EXCEPTION_MAPPING = {
|
||||
RuntimeError,
|
||||
TimeoutError,
|
||||
ConnectionError,
|
||||
UnhealthyServiceError,
|
||||
*[
|
||||
ErrorType
|
||||
for _, ErrorType in inspect.getmembers(exceptions)
|
||||
|
||||
Reference in New Issue
Block a user