Added docs and more specific error handling

This commit is contained in:
Swifty
2025-11-20 11:33:24 +01:00
parent 5388a321c7
commit 0c0488ec0b
4 changed files with 152 additions and 10 deletions

View File

@@ -20,10 +20,36 @@ PRISMA_SCHEMA="postgres/schema.prisma"
# SQLAlchemy Configuration (for gradual migration from Prisma)
# Set to true to enable SQLAlchemy alongside Prisma (both ORMs coexist during migration)
ENABLE_SQLALCHEMY=false
# Connection Pool Configuration
# IMPORTANT: With 6 backend processes, total connections = 6 × (POOL_SIZE + MAX_OVERFLOW)
# Must stay under PostgreSQL max_connections (default: 100)
#
# Environment-specific recommendations:
# Development: POOL_SIZE=2-3, MAX_OVERFLOW=1-2 (lightweight, fast startup)
# Test/CI: POOL_SIZE=2, MAX_OVERFLOW=1 (minimal resources, parallel test safety)
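# Production: POOL_SIZE=10-20, MAX_OVERFLOW=5-10 (handle real traffic and bursts)
# NOTE: With 6 processes, POOL_SIZE + MAX_OVERFLOW above 16 exceeds the default
# max_connections of 100 (6 × 17 = 102); raise PostgreSQL max_connections before using larger values.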
#
# Default values below are suitable for production use:
SQLALCHEMY_POOL_SIZE=10
SQLALCHEMY_MAX_OVERFLOW=5
# Timeout Configuration
# POOL_TIMEOUT: How long to wait for an available connection from the pool (when all connections are busy)
# CONNECT_TIMEOUT: How long to wait when establishing a NEW connection to PostgreSQL
#
# Environment-specific recommendations:
# Development: POOL_TIMEOUT=10-30s, CONNECT_TIMEOUT=5-10s
# Test/CI: POOL_TIMEOUT=5-10s, CONNECT_TIMEOUT=5-10s (fail fast)
# Production: POOL_TIMEOUT=30s, CONNECT_TIMEOUT=10-15s
#
# Default values below are suitable for production use:
SQLALCHEMY_POOL_TIMEOUT=30
SQLALCHEMY_CONNECT_TIMEOUT=10
# SQL Query Logging
# Set to true to log ALL SQL statements (very verbose, useful for debugging)
# Should always be false in production
SQLALCHEMY_ECHO=false
## ===== REQUIRED SERVICE CREDENTIALS ===== ##

View File

@@ -91,6 +91,9 @@ class DatabaseManager(AppService):
# Initialize SQLAlchemy if enabled (for gradual migration from Prisma)
if config.enable_sqlalchemy:
try:
from sqlalchemy.exc import DatabaseError, OperationalError
from sqlalchemy.exc import TimeoutError as SQLAlchemyTimeoutError
from backend.data import sqlalchemy as sa
engine = sa.create_engine()
@@ -101,9 +104,37 @@ class DatabaseManager(AppService):
f"(pool_size={config.sqlalchemy_pool_size}, "
f"max_overflow={config.sqlalchemy_max_overflow})"
)
except OperationalError as e:
logger.error(
f"[{self.service_name}] Failed to connect to database during SQLAlchemy initialization. "
f"Check database connection settings (host, port, credentials). "
f"Database URL: {config.database_url.split('@')[-1] if '@' in config.database_url else 'N/A'}. "
f"Error: {e}"
)
raise
except SQLAlchemyTimeoutError as e:
logger.error(
f"[{self.service_name}] Database connection timeout during SQLAlchemy initialization. "
f"Timeout setting: {config.sqlalchemy_connect_timeout}s. "
f"Check if database is accessible and increase timeout if needed. "
f"Error: {e}"
)
raise
except DatabaseError as e:
logger.error(
f"[{self.service_name}] Database error during SQLAlchemy initialization. "
f"Check database permissions and configuration. "
f"Error: {e}"
)
raise
except Exception as e:
logger.error(
f"[{self.service_name}] Failed to initialize SQLAlchemy: {e}"
f"[{self.service_name}] Unexpected error during SQLAlchemy initialization. "
f"Configuration: pool_size={config.sqlalchemy_pool_size}, "
f"max_overflow={config.sqlalchemy_max_overflow}, "
f"pool_timeout={config.sqlalchemy_pool_timeout}s. "
f"Error: {e}",
exc_info=True,
)
raise
@@ -115,13 +146,25 @@ class DatabaseManager(AppService):
# Dispose SQLAlchemy if it was enabled
if config.enable_sqlalchemy:
try:
from sqlalchemy.exc import DatabaseError, OperationalError
from backend.data import sqlalchemy as sa
await sa.dispose()
logger.info(f"[{self.service_name}] ✓ SQLAlchemy disposed")
except (OperationalError, DatabaseError) as e:
# Log as warning since disposal failures during shutdown are non-critical
logger.warning(
f"[{self.service_name}] Database error while disposing SQLAlchemy connections. "
f"This may leave connections open but won't affect shutdown. "
f"Error: {e}"
)
except Exception as e:
logger.warning(
f"[{self.service_name}] Error disposing SQLAlchemy: {e}"
f"[{self.service_name}] Unexpected error while disposing SQLAlchemy. "
f"Connection pool may not be cleanly released. "
f"Error: {e}",
exc_info=True,
)
await db.disconnect()

View File

@@ -82,6 +82,9 @@ async def lifespan_context(app: fastapi.FastAPI):
config = backend.util.settings.Config()
if config.enable_sqlalchemy:
try:
from sqlalchemy.exc import DatabaseError, OperationalError
from sqlalchemy.exc import TimeoutError as SQLAlchemyTimeoutError
from backend.data import sqlalchemy as sa
engine = sa.create_engine()
@@ -92,8 +95,38 @@ async def lifespan_context(app: fastapi.FastAPI):
f"(pool_size={config.sqlalchemy_pool_size}, "
f"max_overflow={config.sqlalchemy_max_overflow})"
)
except OperationalError as e:
logger.error(
f"Failed to connect to database during SQLAlchemy initialization. "
f"Check database connection settings (host, port, credentials). "
f"Database URL: {config.database_url.split('@')[-1] if '@' in config.database_url else 'N/A'}. "
f"Error: {e}"
)
raise
except SQLAlchemyTimeoutError as e:
logger.error(
f"Database connection timeout during SQLAlchemy initialization. "
f"Timeout setting: {config.sqlalchemy_connect_timeout}s. "
f"Check if database is accessible and increase timeout if needed. "
f"Error: {e}"
)
raise
except DatabaseError as e:
logger.error(
f"Database error during SQLAlchemy initialization. "
f"Check database permissions and configuration. "
f"Error: {e}"
)
raise
except Exception as e:
logger.error(f"Failed to initialize SQLAlchemy: {e}")
logger.error(
f"Unexpected error during SQLAlchemy initialization. "
f"Configuration: pool_size={config.sqlalchemy_pool_size}, "
f"max_overflow={config.sqlalchemy_max_overflow}, "
f"pool_timeout={config.sqlalchemy_pool_timeout}s. "
f"Error: {e}",
exc_info=True,
)
raise
# Configure thread pool for FastAPI sync operation performance
@@ -139,12 +172,26 @@ async def lifespan_context(app: fastapi.FastAPI):
# Dispose SQLAlchemy if it was enabled
if config.enable_sqlalchemy:
try:
from sqlalchemy.exc import DatabaseError, OperationalError
from backend.data import sqlalchemy as sa
await sa.dispose()
logger.info("✓ AgentServer: SQLAlchemy disposed")
except (OperationalError, DatabaseError) as e:
# Log as warning since disposal failures during shutdown are non-critical
logger.warning(
f"Database error while disposing SQLAlchemy connections. "
f"This may leave connections open but won't affect shutdown. "
f"Error: {e}"
)
except Exception as e:
logger.warning(f"Error disposing SQLAlchemy: {e}")
logger.warning(
f"Unexpected error while disposing SQLAlchemy. "
f"Connection pool may not be cleanly released. "
f"Error: {e}",
exc_info=True,
)
await backend.data.db.disconnect()

View File

@@ -285,8 +285,14 @@ class Config(UpdateTrackingModel["Config"], BaseSettings):
ge=1,
le=100,
description="Number of persistent connections in the SQLAlchemy pool. "
"Guidelines: REST API (high traffic) 10-20, Background workers 3-5. "
"Total across all services should not exceed PostgreSQL max_connections (default: 100).",
"Environment-specific recommendations: "
"Development: 2-3 (lightweight, fast startup), "
"Test/CI: 2 (minimal resources, avoid connection exhaustion in parallel tests), "
"Production: 10-20 for REST API (high traffic), 3-5 for background workers. "
"IMPORTANT: Total connections across ALL services (pool_size + max_overflow per service) "
"must not exceed PostgreSQL max_connections (default: 100). "
"With 6 processes in production (rest-api, executor, database-manager, scheduler, websocket, comms), "
"calculate: 6 × (pool_size + max_overflow) ≤ 100.",
)
sqlalchemy_max_overflow: int = Field(
@@ -294,22 +300,42 @@ class Config(UpdateTrackingModel["Config"], BaseSettings):
ge=0,
le=50,
description="Additional connections beyond pool_size when pool is exhausted. "
"Total max connections = pool_size + max_overflow.",
"Total max connections per service = pool_size + max_overflow. "
"Environment-specific recommendations: "
"Development: 1-2 (handles occasional bursts), "
"Test/CI: 1 (minimal extra connections), "
"Production: 5-10 (handles traffic spikes without exhausting pool). "
"Setting to 0 means strict pool limit (connections fail when pool is exhausted). "
"Higher values provide better burst handling but consume more database connections.",
)
sqlalchemy_pool_timeout: int = Field(
default=30,
ge=1,
le=300,
description="Seconds to wait for available connection before raising error. "
"If all connections are busy and max_overflow is reached, requests wait this long before failing.",
description="Seconds to wait for available connection from pool before raising TimeoutError. "
"This timeout applies ONLY when all connections (pool_size + max_overflow) are busy. "
"Environment-specific recommendations: "
"Development: 10-30s (generous for debugging), "
"Test/CI: 5-10s (fail fast in tests), "
"Production: 30s (balance between user experience and resource holding). "
"If you see frequent TimeoutErrors, either increase pool_size/max_overflow or investigate slow queries. "
"NOTE: This is different from sqlalchemy_connect_timeout (which applies when establishing new connections).",
)
sqlalchemy_connect_timeout: int = Field(
default=10,
ge=1,
le=60,
description="Seconds to wait when establishing new connection to PostgreSQL.",
description="Seconds to wait when establishing NEW connection to PostgreSQL database. "
"This timeout applies at the network/TCP level when creating connections (not when acquiring from pool). "
"Environment-specific recommendations: "
"Development: 5-10s (local database should connect quickly), "
"Test/CI: 5-10s (fail fast if database unavailable), "
"Production: 10-15s (account for network latency, especially with cloud databases). "
"If you see frequent connection timeout errors during startup, check database accessibility "
"and network connectivity. "
"NOTE: This is different from sqlalchemy_pool_timeout (which applies when waiting for available connections from pool).",
)
sqlalchemy_echo: bool = Field(