[Refactor, Fix]: Agent controller state/metrics management (#9012)

Co-authored-by: openhands <openhands@all-hands.dev>
2026-01-10 07:18:10 -05:00 · 2025-06-16 11:24:13 -04:00
parent cbe32a1a12
commit 2fd1fdcd7e
27 changed files with 1404 additions and 667 deletions
--- a/openhands/controller/state/control_flags.py
+++ b/openhands/controller/state/control_flags.py
@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Generic, TypeVar
+
+T = TypeVar(
+    'T', int, float
+)  # Type for the value (int for iterations, float for budget)
+
+
+
+@dataclass
+class ControlFlag(Generic[T]):
+    """Base class for control flags that manage limits and state transitions.
+
+    Subclasses implement ``reached_limit``, ``increase_limit`` and ``step``;
+    every method defined here raises ``NotImplementedError``.
+    """
+
+    # Amount used by subclasses to extend ``max_value`` in ``increase_limit``.
+    limit_increase_amount: T
+    # Value that subclasses compare against ``max_value``.
+    current_value: T
+    # Current limit for ``current_value``.
+    max_value: T
+    # NOTE(review): not read by any method in this module (the methods take
+    # ``headless_mode`` as an argument instead) -- confirm it is still needed.
+    headless_mode: bool = False
+    # Outcome of the most recent ``reached_limit`` check made by a subclass.
+    _hit_limit: bool = False
+
+    def reached_limit(self) -> bool:
+        """Check if the limit has been reached.
+
+        Returns:
+            bool: True if the limit has been reached, False otherwise.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+
+    def increase_limit(self, headless_mode: bool) -> None:
+        """Expand the limit when needed.
+
+        Args:
+            headless_mode: Whether the agent runs without an interactive user.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+
+
+    def step(self):
+        """Perform one step of the flag.
+
+        The subclasses in this module raise ``RuntimeError`` from ``step`` once
+        their limit has been reached; nothing is returned.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+
+
+@dataclass
+class IterationControlFlag(ControlFlag[int]):
+    """Control flag that caps the number of agent iterations."""
+
+    def reached_limit(self) -> bool:
+        """Record and report whether ``current_value`` is at or past ``max_value``."""
+        limit_hit = self.current_value >= self.max_value
+        self._hit_limit = limit_hit
+        return limit_hit
+
+    def increase_limit(self, headless_mode: bool) -> None:
+        """Raise ``max_value`` by ``limit_increase_amount`` after a recorded hit.
+
+        Nothing happens in headless mode or when no hit has been recorded.
+        """
+        if headless_mode or not self._hit_limit:
+            return
+        self.max_value += self.limit_increase_amount
+        self._hit_limit = False
+
+    def step(self):
+        """Count one iteration.
+
+        Raises:
+            RuntimeError: If the iteration limit has already been reached.
+        """
+        if self.reached_limit():
+            raise RuntimeError(
+                f'Agent reached maximum iteration. '
+                f'Current iteration: {self.current_value}, max iteration: {self.max_value}'
+            )
+
+        # Still under the limit: account for this iteration.
+        self.current_value += 1
+
+
+
+
+
+@dataclass
+class BudgetControlFlag(ControlFlag[float]):
+    """Control flag for managing budget limits.
+
+    ``current_value`` is never advanced by this class; it is expected to be
+    updated externally with the accumulated cost.
+    """
+
+    def reached_limit(self) -> bool:
+        """Record and return whether ``current_value`` is at or above ``max_value``."""
+        self._hit_limit = self.current_value >= self.max_value
+        return self._hit_limit
+
+    def increase_limit(self, headless_mode: bool) -> None:
+        """Reset the limit to the current spend plus ``limit_increase_amount``.
+
+        Only acts after ``reached_limit`` has recorded a hit.
+
+        Args:
+            headless_mode: Accepted for parity with ``ControlFlag.increase_limit``;
+                it is not consulted here, so the budget is extended in either mode.
+        """
+        if self._hit_limit:
+            self.max_value = self.current_value + self.limit_increase_amount
+            self._hit_limit = False
+
+    def step(self) -> None:
+        """Check if we've reached the budget limit.
+
+        Note: Unlike IterationControlFlag, this doesn't increment the value
+        as the budget is updated externally.
+
+        Raises:
+            RuntimeError: If the budget limit has been reached.
+        """
+        if self.reached_limit():
+            current_str = f'{self.current_value:.2f}'
+            max_str = f'{self.max_value:.2f}'
+            # The trailing space keeps the two sentences apart once the adjacent
+            # string literals are concatenated.
+            raise RuntimeError(
+                'Agent reached maximum budget for conversation. '
+                f'Current budget: {current_str}, max budget: {max_str}'
+            )
+
+
+
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -8,6 +8,10 @@ from enum import Enum
 from typing import Any

 import openhands
+from openhands.controller.state.control_flags import (
+    BudgetControlFlag,
+    IterationControlFlag,
+)
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.schema import AgentState
 from openhands.events.action import (
@@ -20,7 +24,15 @@ from openhands.memory.view import View
 from openhands.storage.files import FileStore
 from openhands.storage.locations import get_conversation_agent_state_filename

+RESUMABLE_STATES = [
+    AgentState.RUNNING,
+    AgentState.PAUSED,
+    AgentState.AWAITING_USER_INPUT,
+    AgentState.FINISHED,
+]

+
+# NOTE: this is deprecated
 class TrafficControlState(str, Enum):
    # default state, no rate limiting
    NORMAL = 'normal'
@@ -32,14 +44,6 @@ class TrafficControlState(str, Enum):
    PAUSED = 'paused'


-RESUMABLE_STATES = [
-    AgentState.RUNNING,
-    AgentState.PAUSED,
-    AgentState.AWAITING_USER_INPUT,
-    AgentState.FINISHED,
-]
-
-
@dataclass
 class State:
    """
@@ -75,35 +79,43 @@ class State:
    """

    session_id: str = ''
-    # global iteration for the current task
-    iteration: int = 0
-    # local iteration for the current subtask
-    local_iteration: int = 0
-    # max number of iterations for the current task
-    max_iterations: int = 100
+    iteration_flag: IterationControlFlag = field(
+        default_factory=lambda: IterationControlFlag(
+            limit_increase_amount=100, current_value=0, max_value=100
+        )
+    )
+    budget_flag: BudgetControlFlag | None = None
    confirmation_mode: bool = False
    history: list[Event] = field(default_factory=list)
    inputs: dict = field(default_factory=dict)
    outputs: dict = field(default_factory=dict)
    agent_state: AgentState = AgentState.LOADING
    resume_state: AgentState | None = None
-    traffic_control_state: TrafficControlState = TrafficControlState.NORMAL
    # global metrics for the current task
    metrics: Metrics = field(default_factory=Metrics)
-    # local metrics for the current subtask
-    local_metrics: Metrics = field(default_factory=Metrics)
    # root agent has level 0, and every delegate increases the level by one
    delegate_level: int = 0
    # start_id and end_id track the range of events in history
    start_id: int = -1
    end_id: int = -1

-    delegates: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
-    # NOTE: This will never be used by the controller, but it can be used by different
+    parent_metrics_snapshot: Metrics | None = None
+    parent_iteration: int = 100
+
+    # NOTE: parent_metrics_snapshot (above) is used by the controller to track the
+    # parent's metrics snapshot before delegation. extra_data (below) can be used by
    # evaluation tasks to store extra data needed to track the progress/state of the task.
    extra_data: dict[str, Any] = field(default_factory=dict)
    last_error: str = ''

+    # NOTE: deprecated args, kept here temporarily for backwards compatibility
+    # Will be removed in 30 days
+    iteration: int | None = None
+    local_iteration: int | None = None
+    max_iterations: int | None = None
+    traffic_control_state: TrafficControlState | None = None
+    local_metrics: Metrics | None = None
+    delegates: dict[tuple[int, int], tuple[str, str]] | None = None
+
    def save_to_session(
        self, sid: str, file_store: FileStore, user_id: str | None
    ) -> None:
@@ -165,6 +177,10 @@ class State:

        # first state after restore
        state.agent_state = AgentState.LOADING
+
+        # We don't need to clean up deprecated fields here
+        # They will be handled by __getstate__ when the state is saved again
+
        return state

    def __getstate__(self) -> dict:
@@ -177,15 +193,52 @@ class State:
        state.pop('_history_checksum', None)
        state.pop('_view', None)

+        # Remove deprecated fields before pickling
+        state.pop('iteration', None)
+        state.pop('local_iteration', None)
+        state.pop('max_iterations', None)
+        state.pop('traffic_control_state', None)
+        state.pop('local_metrics', None)
+        state.pop('delegates', None)
+
        return state

    def __setstate__(self, state: dict) -> None:
+        """Restore attributes from ``state``, upgrading legacy data on the way.
+
+        A dict that still carries the legacy 'iteration' key gets an
+        ``iteration_flag`` built from its 'iteration' / 'max_iterations' values
+        (defaulting to 0 / 100) before it is applied. Afterwards ``history``,
+        ``iteration_flag`` and ``budget_flag`` are given defaults if missing.
+        """
+        # Check if we're restoring from an older version (before control flags)
+        is_old_version = 'iteration' in state
+
+        # Convert old iteration tracking to new iteration_flag if needed
+        if is_old_version:
+            # Create iteration_flag from old values
+            max_iterations = state.get('max_iterations', 100)
+            current_iteration = state.get('iteration', 0)
+
+            # Add the iteration_flag to the state
+            state['iteration_flag'] = IterationControlFlag(
+                limit_increase_amount=max_iterations,
+                current_value=current_iteration,
+                max_value=max_iterations,
+            )
+
+        # Update the state
        self.__dict__.update(state)

+        # We keep the deprecated fields for backward compatibility
+        # They will be removed by __getstate__ when the state is saved again
+
        # make sure we always have the attribute history
        if not hasattr(self, 'history'):
            self.history = []

+        # Ensure we have default values for new fields if they're missing
+        if not hasattr(self, 'iteration_flag'):
+            self.iteration_flag = IterationControlFlag(
+                limit_increase_amount=100, current_value=0, max_value=100
+            )
+
+        if not hasattr(self, 'budget_flag'):
+            self.budget_flag = None
+
    def get_current_user_intent(self) -> tuple[str | None, list[str] | None]:
        """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
        last_user_message = None
@@ -223,6 +276,17 @@ class State:
            ],
        }

+    def get_local_step(self):
+        """Return the iteration count relative to ``parent_iteration``.
+
+        When ``parent_iteration`` is falsy the full iteration count is returned.
+        """
+        global_step = self.iteration_flag.current_value
+        parent_step = self.parent_iteration
+        return global_step - parent_step if parent_step else global_step
+
+    def get_local_metrics(self):
+        """Return ``metrics`` diffed against ``parent_metrics_snapshot``.
+
+        When the snapshot is falsy the metrics object itself is returned.
+        """
+        snapshot = self.parent_metrics_snapshot
+        return self.metrics.diff(snapshot) if snapshot else self.metrics
+
    @property
    def view(self) -> View:
        # Compute a simple checksum from the history to see if we can re-use any
--- a/openhands/controller/state/state_tracker.py
+++ b/openhands/controller/state/state_tracker.py
@@ -0,0 +1,282 @@
+from openhands.controller.agent import Agent
+from openhands.controller.state.control_flags import BudgetControlFlag, IterationControlFlag
+from openhands.controller.state.state import State
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.agent import AgentDelegateAction, ChangeAgentStateAction
+from openhands.events.action.empty import NullAction
+from openhands.events.event import Event
+from openhands.events.event_filter import EventFilter
+from openhands.events.observation.agent import AgentStateChangedObservation
+from openhands.events.observation.delegate import AgentDelegateObservation
+from openhands.events.observation.empty import NullObservation
+from openhands.events.serialization.event import event_to_trajectory
+from openhands.events.stream import EventStream
+from openhands.llm.metrics import Metrics
+from openhands.storage.files import FileStore
+
+
+class StateTracker:
+    """Manages and synchronizes the state of an agent throughout its lifecycle.
+
+    It is responsible for:
+    1. Maintaining agent state persistence across sessions
+    2. Managing agent history by filtering and tracking relevant events (previously done in the agent controller)
+    3. Synchronizing metrics between the controller and LLM components
+    4. Updating control flags for budget and iteration limits
+
+    """
+
+    def __init__(
+        self, sid: str | None, file_store: FileStore | None, user_id: str | None
+    ):
+        """Store the session identifiers and build the agent-history filter."""
+        # Event types listed here, and events marked hidden, are not relevant
+        # to the agent and are therefore kept out of the agent history.
+        ignored_event_types = (
+            NullAction,
+            NullObservation,
+            ChangeAgentStateAction,
+            AgentStateChangedObservation,
+        )
+        self.agent_history_filter = EventFilter(
+            exclude_types=ignored_event_types,
+            exclude_hidden=True,
+        )
+
+        self.sid = sid
+        self.file_store = file_store
+        self.user_id = user_id
+
+    def set_initial_state(
+        self,
+        id: str,
+        agent: Agent,
+        state: State | None,
+        max_iterations: int,
+        max_budget_per_task: float | None,
+        confirmation_mode: bool = False,
+    ) -> None:
+        """Adopt ``state`` as the tracked state, or create a new one when it is None.
+
+        Args:
+            id: Controller id; a trailing '-delegate' is stripped to form the
+                session id of a newly created state.
+            agent: Agent whose ``llm.metrics`` is rebound to the state's metrics.
+            state: State to adopt (from a previous session, in which case it has
+                history, or from a parent agent, in which case it has none), or
+                None to create a brand new state.
+            max_iterations: Iteration limit, and limit increment, of a new state.
+            max_budget_per_task: Budget limit, and limit increment, of a new
+                state; a falsy value means no budget flag is created.
+            confirmation_mode: Whether to enable confirmation mode on a new state.
+        """
+        if state is not None:
+            self.state = state
+            # A negative start_id is normalised to 0.
+            if self.state.start_id <= -1:
+                self.state.start_id = 0
+
+            logger.info(
+                f'AgentController {id} initializing history from event {self.state.start_id}',
+            )
+        else:
+            iteration_flag = IterationControlFlag(
+                limit_increase_amount=max_iterations,
+                current_value=0,
+                max_value=max_iterations,
+            )
+            budget_flag = None
+            if max_budget_per_task:
+                budget_flag = BudgetControlFlag(
+                    limit_increase_amount=max_budget_per_task,
+                    current_value=0,
+                    max_value=max_budget_per_task,
+                )
+
+            self.state = State(
+                session_id=id.removesuffix('-delegate'),
+                inputs={},
+                iteration_flag=iteration_flag,
+                budget_flag=budget_flag,
+                confirmation_mode=confirmation_mode,
+            )
+            self.state.start_id = 0
+
+            logger.info(
+                f'AgentController {id} - created new state. start_id: {self.state.start_id}'
+            )
+
+        # Controller and LLM share a single Metrics object, so accumulated
+        # metrics are always in sync between the two.
+        agent.llm.metrics = self.state.metrics
+
+    def _init_history(self, event_stream: EventStream) -> None:
+        """Initializes the agent's history from the event stream.
+
+        The history is a list of events that:
+        - Excludes events rejected by self.agent_history_filter (the excluded
+          event types and events with hidden=True)
+        - For delegate events (between AgentDelegateAction and AgentDelegateObservation):
+            - Excludes all events between the action and observation
+            - Includes the delegate action and observation themselves
+
+        Args:
+            event_stream: The stream the events are fetched from.
+        """
+        # define range of events to fetch
+        # delegates start with a start_id and initially won't find any events
+        # otherwise we're restoring a previous session
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id >= 0
+            else event_stream.get_latest_event_id()
+        )
+
+        # sanity check
+        if start_id > end_id + 1:
+            logger.warning(
+                f'start_id {start_id} is greater than end_id + 1 ({end_id + 1}). History will be empty.',
+            )
+            self.state.history = []
+            return
+
+        events: list[Event] = list(
+            event_stream.search_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter=self.agent_history_filter,
+            )
+        )
+
+        # Find all delegate action/observation pairs
+        delegate_ranges: list[tuple[int, int]] = []
+        delegate_action_ids: list[int] = []  # stack of unmatched delegate action IDs
+
+        for event in events:
+            if isinstance(event, AgentDelegateAction):
+                delegate_action_ids.append(event.id)
+                # Note: we can get agent=event.agent and task=event.inputs.get('task','')
+                # if we need to track these in the future
+
+            elif isinstance(event, AgentDelegateObservation):
+                # Match with most recent unmatched delegate action
+                if not delegate_action_ids:
+                    logger.warning(
+                        f'Found AgentDelegateObservation without matching action at id={event.id}',
+                    )
+                    continue
+
+                action_id = delegate_action_ids.pop()
+                delegate_ranges.append((action_id, event.id))
+
+        # Filter out events between delegate action/observation pairs
+        if delegate_ranges:
+            filtered_events: list[Event] = []
+            current_idx = 0
+
+            # The loop variables must not be called start_id/end_id: rebinding
+            # start_id here would make the final assignment below store the last
+            # delegate action id instead of the start of the fetched range.
+            for range_start, range_end in sorted(delegate_ranges):
+                # Add events before delegate range
+                filtered_events.extend(
+                    event for event in events[current_idx:] if event.id < range_start
+                )
+
+                # Add delegate action and observation
+                filtered_events.extend(
+                    event for event in events if event.id in (range_start, range_end)
+                )
+
+                # Update index to after delegate range
+                current_idx = next(
+                    (i for i, e in enumerate(events) if e.id > range_end), len(events)
+                )
+
+            # Add any remaining events after last delegate range
+            filtered_events.extend(events[current_idx:])
+
+            self.state.history = filtered_events
+        else:
+            self.state.history = events
+
+        # make sure history is in sync
+        self.state.start_id = start_id
+
+    def close(self, event_stream: EventStream):
+        """Rewrite ``state.history`` from the event stream.
+
+        The final history is used by external scripts like evals and tests, so it
+        needs to be complete WITH delegate events. Like the regular agent history
+        it is still fetched through self.agent_history_filter, i.e. without hidden
+        events and without the excluded backend event types.
+        """
+        first_id = max(self.state.start_id, 0)
+
+        last_id = self.state.end_id
+        if last_id < 0:
+            last_id = event_stream.get_latest_event_id()
+
+        complete_history = event_stream.search_events(
+            start_id=first_id,
+            end_id=last_id,
+            reverse=False,
+            filter=self.agent_history_filter,
+        )
+        self.state.history = list(complete_history)
+
+    def add_history(self, event: Event):
+        """Append ``event`` to the history unless the agent-history filter rejects it."""
+        if not self.agent_history_filter.include(event):
+            return
+        self.state.history.append(event)
+
+    def get_trajectory(self, include_screenshots: bool = False) -> list[dict]:
+        """Convert every event in the history with ``event_to_trajectory``.
+
+        Args:
+            include_screenshots: Passed through to ``event_to_trajectory``.
+        """
+        history = self.state.history
+        return [event_to_trajectory(entry, include_screenshots) for entry in history]
+
+    def maybe_increase_control_flags_limits(
+        self, headless_mode: bool
+    ):
+        """Ask the iteration flag and, if present, the budget flag to extend their limits.
+
+        Iteration and budget extensions are independent of each other; each flag
+        decides on its own whether an extension applies.
+        """
+        state = self.state
+        state.iteration_flag.increase_limit(headless_mode)
+
+        budget_flag = state.budget_flag
+        if budget_flag:
+            budget_flag.increase_limit(headless_mode)
+
+    def get_metrics_snapshot(self):
+        """Return a copy of the state's metrics, as produced by ``metrics.copy()``.
+
+        The copy serves as a snapshot of the parent's metrics at the time a
+        delegate is created; it is stored and later used to compute the
+        delegate's local metrics (delegates accumulate metrics from where the
+        parent left off).
+        """
+        snapshot = self.state.metrics.copy()
+        return snapshot
+
+    def save_state(self):
+        """Save the current state to the persistent store.
+
+        Nothing is saved unless both ``sid`` and ``file_store`` are set.
+        """
+        if not (self.sid and self.file_store):
+            return
+        self.state.save_to_session(self.sid, self.file_store, self.user_id)
+
+
+    def run_control_flags(self):
+        """Perform one step of the iteration flag, then of the budget flag if present."""
+        state = self.state
+        state.iteration_flag.step()
+
+        budget_flag = state.budget_flag
+        if budget_flag:
+            budget_flag.step()
+
+
+    def sync_budget_flag_with_metrics(self):
+        """Copy the accumulated cost from the metrics into the budget flag.
+
+        This keeps the budget flag up to date with the costs accumulated from
+        llm completions so it can detect when the budget is exceeded. Without
+        a budget flag this is a no-op.
+        """
+        budget_flag = self.state.budget_flag
+        if not budget_flag:
+            return
+        budget_flag.current_value = self.state.metrics.accumulated_cost
+
+    def merge_metrics(self, metrics: Metrics):
+        """Merge ``metrics`` into the state metrics and refresh the budget flag.
+
+        NOTE: this should be refactored in the future. Services (draft llm, title
+        autocomplete, condenser, etc) should use their own LLMs while sharing one
+        metrics object, giving a single source of truth for the costs accumulated
+        by all services.
+
+        That would avoid fragmented stores for metrics, and remove the burden of
+        deciding where and how to store them if more specialized services that
+        require llm completions are introduced.
+
+        Args:
+            metrics: The metrics to merge into ``state.metrics``.
+        """
+        state = self.state
+        state.metrics.merge(metrics)
+
+        budget_flag = state.budget_flag
+        if budget_flag:
+            budget_flag.current_value = state.metrics.accumulated_cost