fix(backend/scheduler): Reconfigure scheduling settings & add more logging to execution scheduling logic

Zamil Majdy
2025-08-08 19:27:30 +07:00
parent de7b6b503f
commit a28b2cf04f
2 changed files with 52 additions and 3 deletions


@@ -6,7 +6,12 @@ from enum import Enum
 from typing import Optional
 from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

-from apscheduler.events import EVENT_JOB_ERROR, EVENT_JOB_EXECUTED
+from apscheduler.events import (
+    EVENT_JOB_ERROR,
+    EVENT_JOB_EXECUTED,
+    EVENT_JOB_MAX_INSTANCES,
+    EVENT_JOB_MISSED,
+)
 from apscheduler.job import Job as JobObj
 from apscheduler.jobstores.memory import MemoryJobStore
 from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
@@ -74,11 +79,30 @@ SCHEDULER_OPERATION_TIMEOUT_SECONDS = 300  # 5 minutes for scheduler operations
 def job_listener(event):
     """Logs job execution outcomes for better monitoring."""
     if event.exception:
-        logger.error(f"Job {event.job_id} failed.")
+        logger.error(
+            f"Job {event.job_id} failed: {type(event.exception).__name__}: {event.exception}"
+        )
     else:
         logger.info(f"Job {event.job_id} completed successfully.")
+
+
+def job_missed_listener(event):
+    """Logs when jobs are missed due to scheduling issues."""
+    logger.warning(
+        f"Job {event.job_id} was missed at scheduled time {event.scheduled_run_time}. "
+        f"This can happen if the scheduler is overloaded or if previous executions are still running."
+    )
+
+
+def job_max_instances_listener(event):
+    """Logs when jobs hit max instances limit."""
+    logger.warning(
+        f"Job {event.job_id} execution was SKIPPED - max instances limit reached. "
+        f"Previous execution(s) are still running. "
+        f"Consider increasing max_instances or check why previous executions are taking too long."
+    )


 _event_loop: asyncio.AbstractEventLoop | None = None
 _event_loop_thread: threading.Thread | None = None
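
Note on EVENT_JOB_MISSED: APScheduler declares a run missed once it starts more than misfire_grace_time seconds late, and delivers an event carrying job_id and scheduled_run_time to any listener registered for that event code. A self-contained sketch of the mechanism (illustrative names, not part of this commit):

import logging
import time
from datetime import datetime, timedelta

from apscheduler.events import EVENT_JOB_MISSED
from apscheduler.schedulers.background import BackgroundScheduler

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

def on_missed(event):
    # The same event fields job_missed_listener reads above.
    logger.warning(f"missed {event.job_id} at {event.scheduled_run_time}")

scheduler = BackgroundScheduler()
scheduler.add_listener(on_missed, EVENT_JOB_MISSED)
scheduler.start()
# A run scheduled 5s in the past with a 1s grace period is already late,
# so the scheduler fires the missed-job listener instead of the job body.
scheduler.add_job(
    lambda: None,
    "date",
    run_date=datetime.now() - timedelta(seconds=5),
    misfire_grace_time=1,
    id="demo-job",
)
time.sleep(1)
scheduler.shutdown()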
@@ -95,7 +119,11 @@ def run_async(coro, timeout: float = SCHEDULER_OPERATION_TIMEOUT_SECONDS):
     """Run a coroutine in the shared event loop and wait for completion."""
     loop = get_event_loop()
     future = asyncio.run_coroutine_threadsafe(coro, loop)
-    return future.result(timeout=timeout)
+    try:
+        return future.result(timeout=timeout)
+    except Exception as e:
+        logger.error(f"Async operation failed: {type(e).__name__}: {e}")
+        raise


 def execute_graph(**kwargs):
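
The try/except added here surfaces failures that would otherwise only live inside the returned concurrent.futures.Future. For reference, a minimal sketch (assumed names, not the commit's code) of the pattern run_async is built on: one long-lived event loop on a daemon thread, with run_coroutine_threadsafe bridging synchronous callers into it. future.result() blocks until the coroutine finishes and re-raises anything it raised:

import asyncio
import threading

# A dedicated loop that outlives individual calls, as in the scheduler service.
_loop = asyncio.new_event_loop()
threading.Thread(target=_loop.run_forever, daemon=True).start()

async def fetch_status() -> str:  # hypothetical coroutine
    await asyncio.sleep(0.1)
    return "ok"

def run_async(coro, timeout: float = 5.0):
    # Schedule onto the background loop from a plain (non-async) thread.
    future = asyncio.run_coroutine_threadsafe(coro, _loop)
    return future.result(timeout=timeout)  # re-raises coroutine exceptions

print(run_async(fetch_status()))  # -> ok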
@@ -225,7 +253,18 @@ class Scheduler(AppService):
         _event_loop_thread.start()

         db_schema, db_url = _extract_schema_from_url(os.getenv("DIRECT_URL"))
+        # Configure executors to limit concurrency without skipping jobs
+        from apscheduler.executors.pool import ThreadPoolExecutor
+
         self.scheduler = BlockingScheduler(
+            executors={
+                "default": ThreadPoolExecutor(max_workers=10),  # Max 10 concurrent jobs
+            },
+            job_defaults={
+                "coalesce": True,  # Skip redundant missed jobs - just run the latest
+                "max_instances": 1000,  # Effectively unlimited - never drop executions
+                "misfire_grace_time": None,  # No time limit for missed jobs
+            },
             jobstores={
                 Jobstores.EXECUTION.value: SQLAlchemyJobStore(
                     engine=create_engine(
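
How these job_defaults interact: misfire_grace_time=None means a late run is never declared missed, max_instances=1000 effectively never rejects overlapping runs of the same job, and coalesce=True collapses a backlog of overdue runs into a single catch-up run rather than replaying each one. A standalone sketch of the coalescing behavior (illustrative, not this commit's code):

import time
from datetime import datetime, timedelta

from apscheduler.schedulers.background import BackgroundScheduler

runs = []
scheduler = BackgroundScheduler(
    job_defaults={"coalesce": True, "misfire_grace_time": None}
)
# An every-second job whose first five run times are already in the past.
scheduler.add_job(
    lambda: runs.append(datetime.now()),
    "interval",
    seconds=1,
    start_date=datetime.now() - timedelta(seconds=5),
)
scheduler.start()
time.sleep(0.5)
scheduler.shutdown()
print(len(runs))  # typically 1: the overdue runs coalesce into one catch-up run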
@@ -305,6 +344,8 @@ class Scheduler(AppService):
         )
         self.scheduler.add_listener(job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
+        self.scheduler.add_listener(job_missed_listener, EVENT_JOB_MISSED)
+        self.scheduler.add_listener(job_max_instances_listener, EVENT_JOB_MAX_INSTANCES)
         self.scheduler.start()

     def cleanup(self):

@@ -879,11 +879,19 @@ async def add_graph_execution(
     graph_exec_entry = graph_exec.to_graph_execution_entry()
     if nodes_input_masks:
         graph_exec_entry.nodes_input_masks = nodes_input_masks
+
+    logger.info(
+        f"Created graph execution #{graph_exec.id} for graph "
+        f"#{graph_id} with {len(starting_nodes_input)} starting nodes. "
+        f"Now publishing to execution queue."
+    )
     await queue.publish_message(
         routing_key=GRAPH_EXECUTION_ROUTING_KEY,
         message=graph_exec_entry.model_dump_json(),
         exchange=GRAPH_EXECUTION_EXCHANGE,
     )
+    logger.info(f"Published execution {graph_exec.id} to RabbitMQ queue")
+
     bus = get_async_execution_event_bus()
     await bus.publish(graph_exec)
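
The publish above goes through the project's queue wrapper, but the underlying RabbitMQ operation has roughly this shape. A minimal aio-pika sketch (the exchange and routing-key names here are illustrative, not the repo's constants):

import asyncio

import aio_pika

async def publish(payload: str) -> None:
    connection = await aio_pika.connect_robust("amqp://guest:guest@localhost/")
    async with connection:
        channel = await connection.channel()
        exchange = await channel.declare_exchange(
            "graph_execution", aio_pika.ExchangeType.DIRECT, durable=True
        )
        # Persistent delivery so queued executions survive a broker restart.
        await exchange.publish(
            aio_pika.Message(
                body=payload.encode(),
                delivery_mode=aio_pika.DeliveryMode.PERSISTENT,
            ),
            routing_key="graph_execution.run",
        )

asyncio.run(publish('{"graph_exec_id": "example"}'))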