diff --git a/agenthub/browsing_agent/browsing_agent.py b/agenthub/browsing_agent/browsing_agent.py
index c617bbc6dd..926d9c8338 100644
--- a/agenthub/browsing_agent/browsing_agent.py
+++ b/agenthub/browsing_agent/browsing_agent.py
@@ -15,6 +15,7 @@ from opendevin.events.action import (
 )
 from opendevin.events.event import EventSource
 from opendevin.events.observation import BrowserOutputObservation
+from opendevin.events.observation.observation import Observation
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
     PluginRequirement,
@@ -146,23 +147,21 @@ class BrowsingAgent(Agent):
         last_obs = None
         last_action = None
 
-        if EVAL_MODE and len(state.history) == 1:
+        if EVAL_MODE and len(state.history.get_events_as_list()) == 1:
             # for webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
             # initialize and retrieve the first observation by issuing an noop OP
             # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
             return BrowseInteractiveAction(browser_actions='noop()')
 
-        for prev_action, obs in state.history:
-            if isinstance(prev_action, BrowseInteractiveAction):
-                prev_actions.append(prev_action.browser_actions)
-                last_obs = obs
-                last_action = prev_action
-            elif (
-                isinstance(prev_action, MessageAction)
-                and prev_action.source == EventSource.AGENT
-            ):
-                # agent has responded, task finish.
-                return AgentFinishAction(outputs={'content': prev_action.content})
+        for event in state.history.get_events():
+            if isinstance(event, BrowseInteractiveAction):
+                prev_actions.append(event.browser_actions)
+                last_action = event
+            elif isinstance(event, MessageAction) and event.source == EventSource.AGENT:
+                # agent has responded, task finished.
+                return AgentFinishAction(outputs={'content': event.content})
+            elif isinstance(event, Observation):
+                last_obs = event
 
         if EVAL_MODE:
             prev_actions = prev_actions[1:]  # remove the first noop action
@@ -207,7 +206,7 @@ class BrowsingAgent(Agent):
 
         prompt = get_prompt(error_prefix, cur_axtree_txt, prev_action_str)
         messages.append({'role': 'user', 'content': prompt})
-        logger.info(prompt)
+        logger.debug(prompt)
         response = self.llm.completion(
             messages=messages,
             temperature=0.0,
diff --git a/agenthub/codeact_agent/codeact_agent.py b/agenthub/codeact_agent/codeact_agent.py
index 226f1c029c..494538e2ac 100644
--- a/agenthub/codeact_agent/codeact_agent.py
+++ b/agenthub/codeact_agent/codeact_agent.py
@@ -182,27 +182,14 @@ class CodeActAgent(Agent):
         - MessageAction(content) - Message action to run (e.g. ask for clarification)
         - AgentFinishAction() - end the interaction
         """
-        messages: list[dict[str, str]] = [
-            {'role': 'system', 'content': self.system_message},
-            {'role': 'user', 'content': self.in_context_example},
-        ]
 
-        for prev_action, obs in state.history:
-            action_message = get_action_message(prev_action)
-            if action_message:
-                messages.append(action_message)
+        # if the user's last message is '/exit', finish right away
+        latest_user_message = state.history.get_last_user_message()
+        if latest_user_message and latest_user_message.strip() == '/exit':
+            return AgentFinishAction()
 
-            obs_message = get_observation_message(obs)
-            if obs_message:
-                messages.append(obs_message)
-
-        latest_user_message = [m for m in messages if m['role'] == 'user'][-1]
-        if latest_user_message:
-            if latest_user_message['content'].strip() == '/exit':
-                return AgentFinishAction()
-            latest_user_message['content'] += (
-                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
-            )
+        # prepare what we want to send to the LLM
+        messages: list[dict[str, str]] = self._get_messages(state)
 
         response = self.llm.completion(
             messages=messages,
@@ -217,3 +204,35 @@ class CodeActAgent(Agent):
 
     def search_memory(self, query: str) -> list[str]:
         raise NotImplementedError('Implement this abstract method')
+
+    def _get_messages(self, state: State) -> list[dict[str, str]]:
+        """Assemble the list of chat messages to send to the LLM.
+
+        The list opens with the system message and the in-context example,
+        followed by one message for each history event that converts to one.
+        The remaining-turns reminder is appended to the last user-role message.
+        """
+        conversation = [
+            {'role': 'system', 'content': self.system_message},
+            {'role': 'user', 'content': self.in_context_example},
+        ]
+
+        for event in state.history.get_events():
+            # actions and observations are converted by different helpers
+            if isinstance(event, Action):
+                converted = get_action_message(event)
+            else:
+                converted = get_observation_message(event)
+            # leave out events for which the helper returned nothing
+            if converted:
+                conversation.append(converted)
+
+        # search from the end: the reminder goes on the most recent user-role message
+        for candidate in reversed(conversation):
+            if candidate['role'] == 'user':
+                candidate['content'] += (
+                    f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>'
+                )
+                break
+
+        return conversation
diff --git a/agenthub/codeact_swe_agent/codeact_swe_agent.py b/agenthub/codeact_swe_agent/codeact_swe_agent.py
index 12ad556cd4..82349ecd50 100644
--- a/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -138,27 +138,14 @@ class CodeActSWEAgent(Agent):
         - MessageAction(content) - Message action to run (e.g. ask for clarification)
         - AgentFinishAction() - end the interaction
         """
-        messages: list[dict[str, str]] = [
-            {'role': 'system', 'content': self.system_message},
-            {'role': 'user', 'content': self.in_context_example},
-        ]
 
-        for prev_action, obs in state.history:
-            action_message = get_action_message(prev_action)
-            if action_message:
-                messages.append(action_message)
+        # if the user's last message is '/exit', finish right away
+        latest_user_message = state.history.get_last_user_message()
+        if latest_user_message and latest_user_message.strip() == '/exit':
+            return AgentFinishAction()
 
-            obs_message = get_observation_message(obs)
-            if obs_message:
-                messages.append(obs_message)
-
-        latest_user_message = [m for m in messages if m['role'] == 'user'][-1]
-        if latest_user_message:
-            if latest_user_message['content'].strip() == '/exit':
-                return AgentFinishAction()
-            latest_user_message['content'] += (
-                f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
-            )
+        # prepare what we want to send to the LLM
+        messages: list[dict[str, str]] = self._get_messages(state)
 
         response = self.llm.completion(
             messages=messages,
@@ -173,3 +160,35 @@ class CodeActSWEAgent(Agent):
 
     def search_memory(self, query: str) -> list[str]:
         raise NotImplementedError('Implement this abstract method')
+
+    def _get_messages(self, state: State) -> list[dict[str, str]]:
+        """Assemble the list of chat messages to send to the LLM.
+
+        The list opens with the system message and the in-context example,
+        followed by one message for each history event that converts to one.
+        The remaining-turns reminder is appended to the last user-role message.
+        """
+        conversation = [
+            {'role': 'system', 'content': self.system_message},
+            {'role': 'user', 'content': self.in_context_example},
+        ]
+
+        for event in state.history.get_events():
+            # actions and observations are converted by different helpers
+            if isinstance(event, Action):
+                converted = get_action_message(event)
+            else:
+                converted = get_observation_message(event)
+            # leave out events for which the helper returned nothing
+            if converted:
+                conversation.append(converted)
+
+        # search from the end: the reminder goes on the most recent user-role message
+        for candidate in reversed(conversation):
+            if candidate['role'] == 'user':
+                candidate['content'] += (
+                    f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
+                )
+                break
+
+        return conversation
diff --git a/agenthub/delegator_agent/agent.py b/agenthub/delegator_agent/agent.py
index 2e44ff9e68..f97c5e5096 100644
--- a/agenthub/delegator_agent/agent.py
+++ b/agenthub/delegator_agent/agent.py
@@ -41,7 +41,9 @@ class DelegatorAgent(Agent):
                 agent='StudyRepoForTaskAgent', inputs={'task': task}
             )
 
-        last_observation = state.history[-1][1]
+        # last observation in history should be from the delegate
+        last_observation = state.history.get_last_observation()
+
         if not isinstance(last_observation, AgentDelegateObservation):
             raise Exception('Last observation is not an AgentDelegateObservation')
 
diff --git a/agenthub/dummy_agent/agent.py b/agenthub/dummy_agent/agent.py
index 4b285eaa48..ea72e86e3c 100644
--- a/agenthub/dummy_agent/agent.py
+++ b/agenthub/dummy_agent/agent.py
@@ -125,11 +125,16 @@ class DummyAgent(Agent):
         time.sleep(0.1)
         if state.iteration > 0:
             prev_step = self.steps[state.iteration - 1]
+
+            # a step is (action, observations list)
             if 'observations' in prev_step:
+                # one obs, at most
                 expected_observations = prev_step['observations']
-                hist_start = len(state.history) - len(expected_observations)
+
+                # check if the history matches the expected observations
+                hist_events = state.history.get_last_events(len(expected_observations))
                 for i in range(len(expected_observations)):
-                    hist_obs = event_to_dict(state.history[hist_start + i][1])
+                    hist_obs = event_to_dict(hist_events[i])
                     expected_obs = event_to_dict(expected_observations[i])
                     if (
                         'command_id' in hist_obs['extras']
@@ -143,9 +148,6 @@ class DummyAgent(Agent):
                     ):
                         del expected_obs['extras']['command_id']
                         expected_obs['content'] = ''
-                    if hist_obs != expected_obs:
-                        print('\nactual', hist_obs)
-                        print('\nexpect', expected_obs)
                     assert (
                         hist_obs == expected_obs
                     ), f'Expected observation {expected_obs}, got {hist_obs}'
diff --git a/agenthub/micro/agent.py b/agenthub/micro/agent.py
index 863a6d3ad8..c5b568af08 100644
--- a/agenthub/micro/agent.py
+++ b/agenthub/micro/agent.py
@@ -7,6 +7,7 @@ from opendevin.events.action import Action
 from opendevin.events.serialization.action import action_from_dict
 from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM
+from opendevin.memory.history import ShortTermHistory
 
 from .instructions import instructions
 from .registry import all_microagents
@@ -27,18 +28,24 @@ def to_json(obj, **kwargs):
     return json.dumps(obj, **kwargs)
 
 
-def history_to_json(obj, **kwargs):
+def history_to_json(history: ShortTermHistory, max_events=20, **kwargs):
     """
     Serialize and simplify history to str format
     """
-    if isinstance(obj, list):
-        # process history, make it simpler.
-        processed_history = []
-        for action, observation in obj:
-            processed_history.append(
-                (event_to_memory(action), event_to_memory(observation))
-            )
-        return json.dumps(processed_history, **kwargs)
+
+    # collect at most max_events events, iterating the history in reverse
+    newest_first = []
+    for taken, event in enumerate(history.get_events(reverse=True)):
+        if taken >= max_events:
+            break
+        newest_first.append(event_to_memory(event))
+
+    # the events were gathered in reverse; put them back in their
+    # original order before serializing
+    in_order = newest_first[::-1]
+
+    serialized = json.dumps(in_order, **kwargs)
+    return serialized
 
 
 class MicroAgent(Agent):
diff --git a/agenthub/micro/coder/prompt.md b/agenthub/micro/coder/prompt.md
index 5b8d3195f3..31d4439e2b 100644
--- a/agenthub/micro/coder/prompt.md
+++ b/agenthub/micro/coder/prompt.md
@@ -21,7 +21,7 @@ Do NOT finish until you have completed the tasks.
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-10:]) }}
+{{ history_to_json(state.history, max_events=20) }}
 
 ## Format
 {{ instructions.format.action }}
diff --git a/agenthub/micro/commit_writer/prompt.md b/agenthub/micro/commit_writer/prompt.md
index eabb79f613..4b857ca93a 100644
--- a/agenthub/micro/commit_writer/prompt.md
+++ b/agenthub/micro/commit_writer/prompt.md
@@ -20,7 +20,7 @@ action with `outputs.answer` set to the answer.
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-10:]) }}
+{{ history_to_json(state.history, max_events=20) }}
 
 If the last item in the history is an error, you should try to fix it.
 
diff --git a/agenthub/micro/manager/prompt.md b/agenthub/micro/manager/prompt.md
index 0e05a6c592..c21f275fa3 100644
--- a/agenthub/micro/manager/prompt.md
+++ b/agenthub/micro/manager/prompt.md
@@ -27,7 +27,7 @@ you have delegated to, and why they failed).
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-10:]) }}
+{{ history_to_json(state.history, max_events=20) }}
 
 If the last item in the history is an error, you should try to fix it. If you
 cannot fix it, call the `reject` action.
diff --git a/agenthub/micro/math_agent/prompt.md b/agenthub/micro/math_agent/prompt.md
index c286407160..08cf99a1cd 100644
--- a/agenthub/micro/math_agent/prompt.md
+++ b/agenthub/micro/math_agent/prompt.md
@@ -10,7 +10,7 @@ and call the `finish` action with `outputs.answer` set to the answer.
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-10:]) }}
+{{ history_to_json(state.history, max_events=20) }}
 
 If the last item in the history is an error, you should try to fix it.
 
diff --git a/agenthub/micro/postgres_agent/prompt.md b/agenthub/micro/postgres_agent/prompt.md
index 5ddbfcb78a..aca2a38f6b 100644
--- a/agenthub/micro/postgres_agent/prompt.md
+++ b/agenthub/micro/postgres_agent/prompt.md
@@ -18,7 +18,7 @@ You may take any of the following actions:
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-10:]) }}
+{{ history_to_json(state.history, max_events=20) }}
 
 ## Format
 {{ instructions.format.action }}
diff --git a/agenthub/micro/repo_explorer/prompt.md b/agenthub/micro/repo_explorer/prompt.md
index fde381e40c..b317a4e77b 100644
--- a/agenthub/micro/repo_explorer/prompt.md
+++ b/agenthub/micro/repo_explorer/prompt.md
@@ -20,7 +20,7 @@ When you're done, put your summary into the output of the `finish` action.
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-10:]) }}
+{{ history_to_json(state.history, max_events=20) }}
 
 ## Format
 {{ instructions.format.action }}
diff --git a/agenthub/micro/study_repo_for_task/prompt.md b/agenthub/micro/study_repo_for_task/prompt.md
index 46a64aae4b..91cdf3c3c6 100644
--- a/agenthub/micro/study_repo_for_task/prompt.md
+++ b/agenthub/micro/study_repo_for_task/prompt.md
@@ -24,7 +24,7 @@ implement the solution. If the codebase is empty, you should call the `finish` a
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-10:]) }}
+{{ history_to_json(state.history, max_events=20) }}
 
 ## Format
 {{ instructions.format.action }}
diff --git a/agenthub/micro/typo_fixer_agent/prompt.md b/agenthub/micro/typo_fixer_agent/prompt.md
index a4d04769b3..4440d9b56f 100644
--- a/agenthub/micro/typo_fixer_agent/prompt.md
+++ b/agenthub/micro/typo_fixer_agent/prompt.md
@@ -31,7 +31,7 @@ Do NOT finish until you have fixed all the typos and generated a summary.
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-5:]) }}
+{{ history_to_json(state.history, max_events=10) }}
 
 ## Format
 {{ instructions.format.action }}
diff --git a/agenthub/micro/verifier/prompt.md b/agenthub/micro/verifier/prompt.md
index 7e5ef79a2d..48c7a73cc4 100644
--- a/agenthub/micro/verifier/prompt.md
+++ b/agenthub/micro/verifier/prompt.md
@@ -22,7 +22,7 @@ explaining what the problem is.
 
 ## History
 {{ instructions.history_truncated }}
-{{ history_to_json(state.history[-10:]) }}
+{{ history_to_json(state.history, max_events=20) }}
 
 ## Format
 {{ instructions.format.action }}
diff --git a/agenthub/monologue_agent/agent.py b/agenthub/monologue_agent/agent.py
index e40bea4b7e..ef7df03b70 100644
--- a/agenthub/monologue_agent/agent.py
+++ b/agenthub/monologue_agent/agent.py
@@ -32,9 +32,6 @@ from opendevin.runtime.tools import RuntimeTool
 if config.agent.memory_enabled:
     from opendevin.memory.memory import LongTermMemory
 
-MAX_TOKEN_COUNT_PADDING = 512
-MAX_OUTPUT_LENGTH = 5000
-
 
 class MonologueAgent(Agent):
     VERSION = '1.0'
@@ -68,7 +65,7 @@ class MonologueAgent(Agent):
         Will execute again when called after reset.
 
         Parameters:
-        - task (str): The initial goal statement provided by the user
+        - task: The initial goal statement provided by the user
 
         Raises:
         - AgentNoInstructionError: If task is not provided
@@ -155,16 +152,20 @@ class MonologueAgent(Agent):
         recent_events: list[dict[str, str]] = []
 
         # add the events from state.history
-        for prev_action, obs in state.history:
-            if not isinstance(prev_action, NullAction):
-                recent_events.append(event_to_memory(prev_action))
-            if not isinstance(obs, NullObservation):
-                recent_events.append(event_to_memory(obs))
+        for event in state.history.get_events():
+            recent_events.append(event_to_memory(event))
 
         # add the last messages to long term memory
-        if self.memory is not None and state.history and len(state.history) > 0:
-            self.memory.add_event(event_to_memory(state.history[-1][0]))
-            self.memory.add_event(event_to_memory(state.history[-1][1]))
+        if self.memory is not None:
+            last_action = state.history.get_last_action()
+            last_observation = state.history.get_last_observation()
+
+            # store the latest action and observation, when the history has them
+            # TODO: do this differently - first find out whether this step really produced an action or an observation
+            if last_action:
+                self.memory.add_event(event_to_memory(last_action))
+            if last_observation:
+                self.memory.add_event(event_to_memory(last_observation))
 
         # the action prompt with initial thoughts and recent events
         prompt = prompts.get_request_action_prompt(
@@ -188,10 +189,10 @@ class MonologueAgent(Agent):
         Uses search to produce top 10 results.
 
         Parameters:
-        - query (str): The query that we want to find related memories for
+        - query: The query that we want to find related memories for
 
         Returns:
-        - list[str]: A list of top 10 text results that matched the query
+        - A list of top 10 text results that matched the query
         """
         if self.memory is None:
             return []
diff --git a/agenthub/planner_agent/prompt.py b/agenthub/planner_agent/prompt.py
index a513e797cd..be9a20338d 100644
--- a/agenthub/planner_agent/prompt.py
+++ b/agenthub/planner_agent/prompt.py
@@ -6,13 +6,10 @@ from opendevin.events.action import (
     Action,
     NullAction,
 )
-from opendevin.events.observation import (
-    NullObservation,
-)
 from opendevin.events.serialization.action import action_from_dict
 from opendevin.events.serialization.event import event_to_memory
 
-HISTORY_SIZE = 10
+HISTORY_SIZE = 20
 
 prompt = """
 # Task
@@ -132,18 +129,28 @@ def get_prompt(state: State) -> str:
     - str: The formatted string prompt with historical values
     """
 
+    # the plan
     plan_str = json.dumps(state.root_task.to_dict(), indent=2)
-    sub_history = state.history[-HISTORY_SIZE:]
+
+    # the history
     history_dicts = []
     latest_action: Action = NullAction()
-    for action, observation in sub_history:
-        if not isinstance(action, NullAction):
-            history_dicts.append(event_to_memory(action))
-            latest_action = action
-        if not isinstance(observation, NullObservation):
-            observation_dict = event_to_memory(observation)
-            history_dicts.append(observation_dict)
+
+    # retrieve the latest HISTORY_SIZE events
+    for event_count, event in enumerate(state.history.get_events(reverse=True)):
+        if event_count >= HISTORY_SIZE:
+            break
+        if latest_action == NullAction() and isinstance(event, Action):
+            latest_action = event
+        history_dicts.append(event_to_memory(event))
+
+    # history_dicts is in reverse order, let's fix it
+    history_dicts.reverse()
+
+    # and get it as a JSON string
     history_str = json.dumps(history_dicts, indent=2)
+
+    # the plan status
     current_task = state.root_task.get_current_task()
     if current_task is not None:
         plan_status = f"You're currently working on this task:\n{current_task.goal}."
@@ -151,9 +158,15 @@ def get_prompt(state: State) -> str:
             plan_status += "\nIf it's not achievable AND verifiable with a SINGLE action, you MUST break it down into subtasks NOW."
     else:
         plan_status = "You're not currently working on any tasks. Your next action MUST be to mark a task as in_progress."
+
+    # the hint, based on the last action
     hint = get_hint(event_to_memory(latest_action).get('action', ''))
     logger.info('HINT:\n' + hint, extra={'msg_type': 'DETAIL'})
+
+    # the last relevant user message (the task)
     task = state.get_current_user_intent()
+
+    # finally, fill in the prompt
     return prompt % {
         'task': task,
         'plan': plan_str,
diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
index 6edbcefc9f..a1839af6a7 100644
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -25,7 +25,6 @@ from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 game = None
@@ -42,11 +41,14 @@ def cleanup():
 def codeact_user_response_eda(state: State) -> str:
     global game
     model_guess = ''
+
+    # retrieve the latest model message from history
     if state.history:
-        for act, _ in reversed(state.history):
-            if isinstance(act, MessageAction) and act.source == 'agent':
-                model_guess = act.content
+        for event in state.history.get_events(reverse=True):
+            if isinstance(event, MessageAction) and event.source == 'agent':
+                model_guess = event.content
                 break
+
     assert game is not None, 'Game is not initialized.'
     msg = game.generate_user_response(model_guess)
     game.curr_turn += 1
@@ -149,24 +151,27 @@ def process_instance(
         raise ValueError('State should not be None.')
 
     final_message = ''
-    for act, _ in reversed(state.history):
-        if isinstance(act, MessageAction) and act.source == 'agent':
-            final_message = act.content
+    for event in state.history.get_events(reverse=True):
+        if isinstance(event, MessageAction) and event.source == 'agent':
+            final_message = event.content
             break
 
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()
     metrics = state.metrics.get() if state.metrics else None
 
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     # Save the output
     output = {
         'instance_id': instance['text'].strip(),
         'instance': instance,
         'instruction': instruction,
         'metadata': metadata.model_dump(),
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
+        'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,
         'test_result': {
diff --git a/evaluation/TUTORIAL.md b/evaluation/TUTORIAL.md
index b29ae7a43b..b6e1853671 100644
--- a/evaluation/TUTORIAL.md
+++ b/evaluation/TUTORIAL.md
@@ -100,13 +100,14 @@ def codeact_user_response(state: State) -> str:
         'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
         'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
     )
+    # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
     if state.history:
         user_msgs = [
-            action
-            for action, _ in state.history
-            if isinstance(action, MessageAction) and action.source == 'agent'
+            event
+            for event in state.history.get_events()
+            if isinstance(event, MessageAction) and event.source == 'user'
         ]
-        if len(user_msgs) >= 2:
+        if len(user_msgs) > 2:
             # let the agent know that it can give up when it has tried 3 times
             return (
                 msg
diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
index 4e089ae1aa..0457bfdd08 100644
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -27,7 +27,6 @@ from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
 from opendevin.events.action import CmdRunAction, MessageAction
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 
@@ -145,13 +144,15 @@ def process_instance(
     else:
         logger.info('Retrieving agent answer from history.')
         raw_ans = ''
-        for act, _ in reversed(state.history):
-            if isinstance(act, MessageAction) and act.source == 'agent':
-                raw_ans = act.content
-                break
-            if isinstance(act, CmdRunAction) and act.source == 'agent':
-                raw_ans = act.thought
-                break
+
+        # keep the LAST agent message or thought: walk forward so later events overwrite earlier ones
+        for event in state.history.get_events():
+            if isinstance(event, MessageAction) and event.source == 'agent':
+                raw_ans = event.content
+            elif isinstance(event, CmdRunAction) and event.source == 'agent':
+                raw_ans = event.thought
+
+        # parse the answer for a solution tag
         agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
         if len(agent_answer) == 0:
             logger.warning(f'Failed to parse model answer: {raw_ans}')
@@ -179,9 +180,11 @@ def process_instance(
     )
     test_result = compare_results(comparison_method, agent_answer, final_ans)
 
-    histories = [
-        (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-    ]
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py
index 464c59b35f..ec4913d33e 100644
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -24,7 +24,6 @@ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 
@@ -196,6 +195,11 @@ def process_instance(
         raise ValueError('State should not be None.')
     metrics = state.metrics.get() if state.metrics else None
 
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     # Save the output
     output = {
         'test_case_id': instance.test_case_id,
@@ -203,9 +207,7 @@ def process_instance(
         'instruction': instruction,
         'generated': test_result['metadata']['1_copy_change_code'],
         'metadata': metadata.model_dump(),
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
+        'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,
         'test_result': test_result,
diff --git a/evaluation/bird/README.md b/evaluation/bird/README.md
index 250aefb5e5..86e3d9d091 100644
--- a/evaluation/bird/README.md
+++ b/evaluation/bird/README.md
@@ -143,7 +143,6 @@ For each problem, OpenDevin is given a set number of iterations to fix the faili
         "action": "run",
         "args": {
           "command": "python3 0.py",
-          "background": false,
           "thought": "The Python code with the SQL query has been written to the file `0.py`. Now, let's run the Python script to execute the SQL query and get the result."
         }
       },
diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
index f3cb1ec505..b3ba0c97d5 100644
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -27,7 +27,6 @@ from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 
@@ -46,12 +45,13 @@ def codeact_user_response(state: State) -> str:
         'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
     )
     if state.history:
+        # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
         user_msgs = [
-            action
-            for action, _ in state.history
-            if isinstance(action, MessageAction) and action.source == 'user'
+            event
+            for event in state.history.get_events()
+            if isinstance(event, MessageAction) and event.source == 'user'
         ]
-        if len(user_msgs) >= 2:
+        if len(user_msgs) > 2:
             # let the agent know that it can give up when it has tried 3 times
             return (
                 msg
@@ -245,14 +245,17 @@ def process_instance(
         raise ValueError('State should not be None.')
     metrics = state.metrics.get() if state.metrics else None
 
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     # Save the output
     output = {
         'task_id': instance.task_id,
         'instruction': instruction,
         'metadata': metadata.model_dump(),
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
+        'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,
         'test_result': test_result,
diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
index e732c0eed7..07fa24b538 100644
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -26,7 +26,6 @@ from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
 from opendevin.events.action import CmdRunAction, MessageAction
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 DATASET_CACHE_DIR = '~/.cache/open-devin/evals/gaia'
@@ -136,13 +135,15 @@ def process_instance(
             raise ValueError('State should not be None.')
 
         model_answer_raw = ''
-        for act, _ in reversed(state.history):
-            if isinstance(act, CmdRunAction) and act.source == 'agent':
-                model_answer_raw = act.thought
-                break
-            elif isinstance(act, MessageAction) and act.source == 'agent':
-                model_answer_raw = act.content
-                break
+
+        # get the last message or thought from the agent (events come newest first, so stop at the first match)
+        for event in state.history.get_events(reverse=True):
+            if isinstance(event, CmdRunAction) and event.source == 'agent':
+                model_answer_raw = event.thought
+                break
+            elif isinstance(event, MessageAction) and event.source == 'agent':
+                model_answer_raw = event.content
+                break
 
         # attempt to parse model_answer
         model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
@@ -166,16 +167,18 @@ def process_instance(
         }
         metrics = state.metrics.get() if state.metrics else None
 
+        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+        # for compatibility with the existing output format, we can remake the pairs here
+        # remove when it becomes unnecessary
+        histories = state.history.compatibility_for_eval_history_pairs()
+
         # Save the output
         output = {
             'instance_id': instance['task_id'],
             'instance': instance,
             'instruction': instance['Question'],
             'metadata': metadata.model_dump(),
-            'history': [
-                (event_to_dict(action), event_to_dict(obs))
-                for action, obs in state.history
-            ],
+            'history': histories,
             'metrics': metrics,
             'error': state.last_error if state and state.last_error else None,
             'test_result': test_result,
diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
index 1e1750ba0f..aceb3ad0c9 100644
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -17,7 +17,6 @@ from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 from .utils import encode_question, get_data
@@ -37,13 +36,15 @@ def codeact_user_response(state: State) -> str:
         'Please run the following command: <execute_bash> exit </execute_bash>.\n'
         #'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
     )
+
+    # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
     if state.history:
         user_msgs = [
-            action
-            for action, _ in state.history
-            if isinstance(action, MessageAction) and action.source == 'user'
+            event
+            for event in state.history.get_events()
+            if isinstance(event, MessageAction) and event.source == 'user'
         ]
-        if len(user_msgs) >= 2:
+        if len(user_msgs) > 2:
             # let the agent know that it can give up when it has tried 3 times
             return (
                 msg
@@ -131,10 +132,12 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
             raise ValueError('State should not be None.')
 
         model_answer_raw = ''
-        for act, _ in reversed(state.history):
-            if isinstance(act, MessageAction) and act.source == 'agent':
-                model_answer_raw = act.content
-                break
+        # retrieve the last message from the agent (events come newest first, so stop at the first match)
+        for event in state.history.get_events(reverse=True):
+            if isinstance(event, MessageAction) and event.source == 'agent':
+                model_answer_raw = event.content
+                break
+
         # attempt to parse model_answer
         _, _, ast_eval = get_data(metadata['hub'])
         correct, hallucination = ast_eval(question_id, model_answer_raw)
@@ -142,6 +145,12 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
         logger.info(
             f'Final message: {model_answer_raw} | Correctness: {correct} | Hallucination: {hallucination}'
         )
+
+        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+        # for compatibility with the existing output format, we can remake the pairs here
+        # remove when it becomes unnecessary
+        histories = state.history.compatibility_for_eval_history_pairs()
+
         # Save the output
         output = {
             'question_id': question_id,
@@ -151,10 +160,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
             'answer_id': 'None',
             'model_id': metadata['model_name'],
             'metadata': metadata.model_dump(),
-            'history': [
-                (event_to_dict(action), event_to_dict(obs))
-                for action, obs in state.history
-            ],
+            'history': histories,
             'metrics': metrics,
             'error': state.last_error if state and state.last_error else None,
         }
diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
index fd75ef0b8d..1ee0dc1b3e 100644
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -42,7 +42,6 @@ from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
@@ -214,7 +213,7 @@ def process_instance(
         final_message = next(
             (
                 act.content
-                for act in reversed(state.history)
+                for act in state.history.get_events(reverse=True)
                 if isinstance(act, MessageAction)
             ),
             None,
@@ -231,16 +230,18 @@ def process_instance(
 
         metrics = state.metrics.get() if state.metrics else None
 
+        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+        # for compatibility with the existing output format, we can remake the pairs here
+        # remove when it becomes unnecessary
+        histories = state.history.compatibility_for_eval_history_pairs()
+
         # Save the output
         output = {
             'task_id': instance.task_id,
             'instance_id': instance.instance_id,
             'instruction': instruction,
             'metadata': metadata.model_dump(),
-            'history': [
-                (event_to_dict(action), event_to_dict(obs))
-                for action, obs in state.history
-            ],
+            'history': histories,
             'metrics': metrics,
             'error': state.last_error if state and state.last_error else None,
             'test_result': test_result,
diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py
index ec735be242..8c1e24f518 100644
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -32,7 +32,6 @@ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 IMPORT_HELPER = {
@@ -202,15 +201,17 @@ def process_instance(
             raise ValueError('State should not be None.')
         metrics = state.metrics.get() if state.metrics else None
 
+        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+        # for compatibility with the existing output format, we can remake the pairs here
+        # remove when it becomes unnecessary
+        histories = state.history.compatibility_for_eval_history_pairs()
+
         # Save the output
         output = {
             'task_id': instance.task_id,
             'instruction': instruction,
             'metadata': metadata.model_dump(),
-            'history': [
-                (event_to_dict(action), event_to_dict(obs))
-                for action, obs in state.history
-            ],
+            'history': histories,
             'metrics': metrics,
             'error': state.last_error if state and state.last_error else None,
             'test_result': test_result,
diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
index 6eb28162c5..75fe39b2a7 100644
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -22,7 +22,6 @@ from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
@@ -199,12 +198,12 @@ def process_instance(
 
         final_message = ''
         messages = []
-        for action, obs in reversed(state.history):
-            # if isinstance(act, MessageAction):
-            messages.append(obs.content)
-            # print("obs.content:", obs.content)
-            if str(obs.content) in ["'A'", "'B'", "'C'"]:
-                final_message = obs.content
+        for event in state.history.get_events(reverse=True):
+            # TODO filter by event type; until then read `content` defensively, as an event may not have one
+            content = getattr(event, 'content', '')
+            messages.append(content)
+            if str(content) in ["'A'", "'B'", "'C'"]:
+                final_message = content
                 break
 
         final_message = final_message.strip("'")
@@ -217,16 +216,18 @@ def process_instance(
         )
         metrics = state.metrics.get() if state.metrics else None
 
+        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+        # for compatibility with the existing output format, we can remake the pairs here
+        # remove when it becomes unnecessary
+        histories = state.history.compatibility_for_eval_history_pairs()
+
         # Save the output
         output = {
             'id': instance['id'],
             'instance': instance,
             'instruction': instruction,
             # 'metadata': metadata.model_dump(),
-            'history': [
-                (event_to_dict(action), event_to_dict(obs))
-                for action, obs in state.history
-            ],
+            'history': histories,
             'metrics': metrics,
             'final_message': final_message,
             'messages': messages,
diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
index 63bfc47e9d..3ef71a030d 100644
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -19,7 +19,6 @@ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 from opendevin.runtime.tools import RuntimeTool
@@ -110,14 +109,17 @@ def process_instance(
         rewards = json.load(f)
         reward = max(rewards)
 
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     # Save the output
     output = {
         'instance_id': env_id,
         'instruction': instruction,
         'metadata': metadata.model_dump(),
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
+        'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,
         'test_result': reward,
diff --git a/evaluation/mint/env.py b/evaluation/mint/env.py
index c3ad05a00d..26e5275717 100644
--- a/evaluation/mint/env.py
+++ b/evaluation/mint/env.py
@@ -99,7 +99,6 @@ class SimplifiedEnv:
             return
 
         content = output.to_str()
-        # self.state.history.append({"role": "user", "content": content})
         self.task_state.latest_output = output.to_dict()
         self.task_state.latest_output['content'] = content
 
diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
index 2d2c7f2558..5c88287dab 100644
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -21,7 +21,6 @@ from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 from .datatypes import TaskState
@@ -39,7 +38,7 @@ def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str,
         task=task,
         task_config=task_config,
     )
-    last_action, _ = state.history[-1]
+    last_action = state.history.get_last_action()
     result_state: TaskState = env.step(last_action.message or '')
 
     state.task_state = result_state
@@ -162,15 +161,18 @@ def process_instance(
 
     metrics = state.metrics.get() if state.metrics else None
 
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     # Save the output
     output = {
         'id': instance.task_id,
         'instance': instance.to_dict(),
         'instruction': instruction,
         'metadata': metadata.model_dump(),
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
+        'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,
         'test_result': task_state.success if task_state else False,
diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
index 989f4dc660..996bf798ed 100644
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -36,7 +36,6 @@ from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 
@@ -195,16 +194,18 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
             logger.info(f'Output: {eval_output}')
             metrics['success'] = 1
 
+        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+        # for compatibility with the existing output format, we can remake the pairs here
+        # remove when it becomes unnecessary
+        histories = state.history.compatibility_for_eval_history_pairs()
+
         # Save the output
         output = {
             'instance_id': instance['id'],
             'repo': repo_url,
             'instruction': instruction,
             'metadata': metadata.model_dump(),
-            'history': [
-                (event_to_dict(action), event_to_dict(obs))
-                for action, obs in state.history
-            ],
+            'history': histories,
             'eval_script': eval_script_content,
             'eval_exit_code': exit_code,
             'eval_output': eval_output,
diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 67c798be95..d0fd6e4563 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -25,7 +25,6 @@ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
@@ -310,6 +309,11 @@ IMPORTANT TIPS:
 
     metrics = state.metrics.get() if state.metrics else None
 
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     # Save the output
     output = {
         'instance_id': instance.instance_id,
@@ -317,9 +321,7 @@ IMPORTANT TIPS:
         'instruction': instruction,
         'git_patch': git_patch,  # SWE Bench specific
         'metadata': metadata.model_dump(),
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
+        'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,
         'test_result': test_result,
diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
index b3f50c03fd..ed2d20bdf1 100644
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -21,7 +21,6 @@ from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
 from opendevin.events.action import MessageAction
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 
 from .utils import download_data, download_tools, encode_question, eval_answer, get_data
@@ -97,14 +96,24 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
         raise ValueError('State should not be None.')
 
     model_answer_raw = ''
-    for act, _ in reversed(state.history):
-        if isinstance(act, MessageAction) and act.source == 'agent':
-            model_answer_raw = act.content
+
+    # retrieve the last message from the agent
+    for event in state.history.get_events(reverse=True):
+        if isinstance(event, MessageAction) and event.source == 'agent':
+            model_answer_raw = event.content
             break
+
     # attempt to parse model_answer
     correct = eval_answer(str(model_answer_raw), str(answer))
-    metrics = state.metrics.get() if state.metrics else None
     logger.info(f'Final message: {model_answer_raw} | Correctness: {correct}')
+
+    metrics = state.metrics.get() if state.metrics else None
+
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     # Save the output
     output = {
         'qid': qid,
@@ -113,9 +122,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
         'answer_id': 'None',
         'model_id': metadata.model_name,
         'metadata': metadata.model_dump(),
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
+        'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,
     }
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index 0b6c36d13b..e80c0f963a 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -49,16 +49,20 @@ def codeact_user_response(
         f'{encaps_str}'
         'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
     )
+
     if state.history:
+        # check if the last action has an answer, if so, early exit
         if try_parse is not None:
-            last_action, _ = state.history[-1]
+            last_action = state.history.get_last_action()
             ans = try_parse(last_action)
             if ans is not None:
                 return '/exit'
+
+        # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
         user_msgs = [
-            action
-            for action, _ in state.history
-            if isinstance(action, MessageAction) and action.source == 'user'
+            event
+            for event in state.history.get_events()
+            if isinstance(event, MessageAction) and event.source == 'user'
         ]
         if len(user_msgs) >= 2:
             # let the agent know that it can give up when it has tried 3 times
diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py
index c3aacb9eb1..b4db517668 100644
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -19,7 +19,6 @@ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
-from opendevin.events.serialization.event import event_to_dict
 from opendevin.llm.llm import LLM
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
 from opendevin.runtime.tools import RuntimeTool
@@ -111,14 +110,17 @@ def process_instance(
         rewards = json.load(f)
         reward = max(rewards)
 
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
     # Save the output
     output = {
         'instance_id': env_id,
         'instruction': instruction,
         'metadata': metadata.model_dump(),
-        'history': [
-            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
-        ],
+        'history': histories,
         'metrics': metrics,
         'error': state.last_error if state and state.last_error else None,
         'test_result': reward,
diff --git a/opendevin/controller/agent.py b/opendevin/controller/agent.py
index bf5829c104..ead7a024f1 100644
--- a/opendevin/controller/agent.py
+++ b/opendevin/controller/agent.py
@@ -70,6 +70,7 @@ class Agent(ABC):
         to prepare the agent for restarting the instruction or cleaning up before destruction.
 
         """
+        # TODO clear history
         self._complete = False
 
     @property
diff --git a/opendevin/controller/agent_controller.py b/opendevin/controller/agent_controller.py
index 40c41ec3dc..60815d66f0 100644
--- a/opendevin/controller/agent_controller.py
+++ b/opendevin/controller/agent_controller.py
@@ -4,6 +4,7 @@ from typing import Optional, Type
 
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State, TrafficControlState
+from opendevin.controller.stuck import StuckDetector
 from opendevin.core.config import config
 from opendevin.core.exceptions import (
     LLMMalformedActionError,
@@ -94,6 +95,10 @@ class AgentController:
         )
 
         self.max_budget_per_task = max_budget_per_task
+
+        # stuck helper
+        self._stuck_detector = StuckDetector(self.state)
+
         if not is_delegate:
             self.agent_task = asyncio.create_task(self._start_step_loop())
 
@@ -114,9 +119,9 @@ class AgentController:
         """
         This error will be reported to the user and sent to the LLM next step, in the hope it can self-correct.
 
-        This method should be called for a particular type of errors:
-        - the string message should be user-friendly, it will be shown in the UI
-        - an ErrorObservation can be sent to the LLM by the agent, with the exception message, so it can self-correct next time
+        This method should be called for a particular type of errors, which have:
+        - a user-friendly message, which will be shown in the chat box. This should not be a raw exception message.
+        - an ErrorObservation that can be sent to the LLM by the agent, with the exception message, so it can self-correct next time.
         """
         self.state.last_error = message
         if exception:
@@ -126,7 +131,9 @@ class AgentController:
     async def add_history(self, action: Action, observation: Observation):
         if isinstance(action, NullAction) and isinstance(observation, NullObservation):
             return
-        self.state.history.append((action, observation))
+        logger.debug(
+            f'Adding history ({type(action).__name__} with id={action.id}, {type(observation).__name__} with id={observation.id})'
+        )
 
     async def _start_step_loop(self):
         logger.info(f'[Agent Controller {self.id}] Starting step loop...')
@@ -180,12 +187,14 @@ class AgentController:
                 logger.info(event, extra={'msg_type': 'OBSERVATION'})
             elif isinstance(event, AgentDelegateObservation):
                 await self.add_history(NullAction(), event)
+                self.state.history.on_event(event)
                 logger.info(event, extra={'msg_type': 'OBSERVATION'})
             elif isinstance(event, ErrorObservation):
                 await self.add_history(NullAction(), event)
                 logger.info(event, extra={'msg_type': 'OBSERVATION'})
 
     def reset_task(self):
+        self.almost_stuck = 0
         self.agent.reset()
 
     async def set_agent_state_to(self, new_state: AgentState):
@@ -244,7 +253,6 @@ class AgentController:
         await self.delegate.set_agent_state_to(AgentState.RUNNING)
 
     async def _step(self):
-        logger.debug(f'[Agent Controller {self.id}] Entering step method')
         if self.get_agent_state() != AgentState.RUNNING:
             await asyncio.sleep(1)
             return
@@ -347,12 +355,13 @@ class AgentController:
 
         if action.runnable:
             self._pending_action = action
-        else:
-            await self.add_history(action, NullObservation(''))
 
         if not isinstance(action, NullAction):
             self.event_stream.add_event(action, EventSource.AGENT)
 
+        if not action.runnable:
+            await self.add_history(action, NullObservation(''))
+
         await self.update_state_after_step()
         logger.info(action, extra={'msg_type': 'ACTION'})
 
@@ -373,83 +382,32 @@ class AgentController:
         else:
             self.state = state
 
+        # when restored from a previous session, the State object will have history, start_id, and end_id
+        # connect it to the event stream
+        self.state.history.set_event_stream(self.event_stream)
+
+        # if start_id was not set in State, we're starting fresh, at the top of the stream
+        start_id = self.state.start_id
+        if start_id == -1:
+            start_id = self.event_stream.get_latest_event_id() + 1
+        else:
+            logger.debug(f'AgentController {self.id} restoring from event {start_id}')
+
+        # make sure history is in sync
+        self.state.start_id = start_id
+        self.state.history.start_id = start_id
+
+        # if there was an end_id saved in State, set it in history
+        # currently not used, later useful for delegates
+        if self.state.end_id > -1:
+            self.state.history.end_id = self.state.end_id
+
     def _is_stuck(self):
         # check if delegate stuck
         if self.delegate and self.delegate._is_stuck():
             return True
 
-        # filter out MessageAction with source='user' from history
-        filtered_history = [
-            _tuple
-            for _tuple in self.state.history
-            if not (
-                isinstance(_tuple[0], MessageAction)
-                and _tuple[0].source == EventSource.USER
-            )
-        ]
-
-        if len(filtered_history) < 3:
-            return False
-
-        # FIXME rewrite this to be more readable
-
-        # Scenario 1: the same (Action, Observation) loop
-        # 3 pairs of (action, observation) to stop the agent
-        last_three_tuples = filtered_history[-3:]
-
-        if all(
-            # (Action, Observation) tuples
-            # compare the last action to the last three actions
-            self._eq_no_pid(last_three_tuples[-1][0], _tuple[0])
-            for _tuple in last_three_tuples
-        ) and all(
-            # compare the last observation to the last three observations
-            self._eq_no_pid(last_three_tuples[-1][1], _tuple[1])
-            for _tuple in last_three_tuples
-        ):
-            logger.warning('Action, Observation loop detected')
-            return True
-
-        if len(filtered_history) < 4:
-            return False
-
-        last_four_tuples = filtered_history[-4:]
-
-        # Scenario 2: (action, error) pattern, not necessary identical error
-        # 4 pairs of (action, error) to stop the agent
-        if all(
-            self._eq_no_pid(last_four_tuples[-1][0], _tuple[0])
-            for _tuple in last_four_tuples
-        ):
-            # It repeats the same action, give it a chance, but not if:
-            if all(
-                isinstance(_tuple[1], ErrorObservation) for _tuple in last_four_tuples
-            ):
-                logger.warning('Action, ErrorObservation loop detected')
-                return True
-
-        # check if the agent repeats the same (Action, Observation)
-        # every other step in the last six tuples
-        # step1 = step3 = step5
-        # step2 = step4 = step6
-        if len(filtered_history) >= 6:
-            last_six_tuples = filtered_history[-6:]
-            if (
-                # this pattern is every other step, like:
-                # (action_1, obs_1), (action_2, obs_2), (action_1, obs_1), (action_2, obs_2),...
-                self._eq_no_pid(last_six_tuples[-1][0], last_six_tuples[-3][0])
-                and self._eq_no_pid(last_six_tuples[-1][0], last_six_tuples[-5][0])
-                and self._eq_no_pid(last_six_tuples[-2][0], last_six_tuples[-4][0])
-                and self._eq_no_pid(last_six_tuples[-2][0], last_six_tuples[-6][0])
-                and self._eq_no_pid(last_six_tuples[-1][1], last_six_tuples[-3][1])
-                and self._eq_no_pid(last_six_tuples[-1][1], last_six_tuples[-5][1])
-                and self._eq_no_pid(last_six_tuples[-2][1], last_six_tuples[-4][1])
-                and self._eq_no_pid(last_six_tuples[-2][1], last_six_tuples[-6][1])
-            ):
-                logger.warning('Action, Observation pattern detected')
-                return True
-
-        return False
+        return self._stuck_detector.is_stuck()
 
     def __repr__(self):
         return (
@@ -458,13 +416,3 @@ class AgentController:
             f'state={self.state!r}, agent_task={self.agent_task!r}, '
             f'delegate={self.delegate!r}, _pending_action={self._pending_action!r})'
         )
-
-    def _eq_no_pid(self, obj1, obj2):
-        if isinstance(obj1, CmdOutputObservation) and isinstance(
-            obj2, CmdOutputObservation
-        ):
-            # for loop detection, ignore command_id, which is the pid
-            return obj1.command == obj2.command and obj1.exit_code == obj2.exit_code
-        else:
-            # this is the default comparison
-            return obj1 == obj2
diff --git a/opendevin/controller/state/state.py b/opendevin/controller/state/state.py
index 06af157b2e..06c35811e1 100644
--- a/opendevin/controller/state/state.py
+++ b/opendevin/controller/state/state.py
@@ -8,12 +8,10 @@ from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.metrics import Metrics
 from opendevin.core.schema import AgentState
 from opendevin.events.action import (
-    Action,
     MessageAction,
 )
-from opendevin.events.observation import (
-    Observation,
-)
+from opendevin.events.action.agent import AgentFinishAction
+from opendevin.memory.history import ShortTermHistory
 from opendevin.storage import get_file_store
 
 
@@ -41,7 +39,7 @@ class State:
     root_task: RootTask = field(default_factory=RootTask)
     iteration: int = 0
     max_iterations: int = 100
-    history: list[tuple[Action, Observation]] = field(default_factory=list)
+    history: ShortTermHistory = field(default_factory=ShortTermHistory)
     inputs: dict = field(default_factory=dict)
     outputs: dict = field(default_factory=dict)
     last_error: str | None = None
@@ -51,10 +49,15 @@ class State:
     metrics: Metrics = Metrics()
     # root agent has level 0, and every delegate increases the level by one
     delegate_level: int = 0
+    # start_id and end_id track the range of events in history
+    start_id: int = -1
+    end_id: int = -1
+    almost_stuck: int = 0
 
     def save_to_session(self, sid: str):
         fs = get_file_store()
         pickled = pickle.dumps(self)
+        logger.debug(f'Saving state to session {sid}:{self.agent_state}')
         encoded = base64.b64encode(pickled).decode('utf-8')
         try:
             fs.write(f'sessions/{sid}/agent_state.pkl', encoded)
@@ -79,10 +82,42 @@ class State:
         state.agent_state = AgentState.LOADING
         return state
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+
+        # save the relevant data from recent history
+        # so that we can restore it when the state is restored
+        if 'history' in state:
+            state['start_id'] = state['history'].start_id
+            state['end_id'] = state['history'].end_id
+
+        # don't save history object itself
+        state.pop('history', None)
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+
+        # __getstate__ drops 'history' before pickling, so recreate the object here
+        if not hasattr(self, 'history'):
+            self.history = ShortTermHistory()
+
+        # restore the relevant data in history from the state
+        self.history.start_id = self.start_id
+        self.history.end_id = self.end_id
+
+        # NOTE(review): start_id and end_id are also left on the state; nothing is removed here
+
     def get_current_user_intent(self):
-        # TODO: this is used to understand the user's main goal, but it's possible
-        # the latest message is an interruption. We should look for a space where
-        # the agent goes to FINISHED, and then look for the next user message.
-        for action, obs in reversed(self.history):
-            if isinstance(action, MessageAction) and action.source == 'user':
-                return action.content
+        """
+        Returns the earliest user message sent after the most recent AgentFinishAction that is followed by a user message, or the first user message in history (the task) if there is no such AgentFinishAction; None if history has no user message.
+        """
+        last_user_message = None
+        for event in self.history.get_events(reverse=True):
+            if isinstance(event, MessageAction) and event.source == 'user':
+                last_user_message = event.content
+            elif isinstance(event, AgentFinishAction):
+                if last_user_message is not None:
+                    return last_user_message
+
+        return last_user_message
diff --git a/opendevin/controller/stuck.py b/opendevin/controller/stuck.py
new file mode 100644
index 0000000000..8ac4397f1c
--- /dev/null
+++ b/opendevin/controller/stuck.py
@@ -0,0 +1,237 @@
+from typing import cast
+
+from opendevin.controller.state.state import State
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.events.action.action import Action
+from opendevin.events.action.empty import NullAction
+from opendevin.events.action.message import MessageAction
+from opendevin.events.event import Event, EventSource
+from opendevin.events.observation.commands import (
+    CmdOutputObservation,
+    IPythonRunCellObservation,
+)
+from opendevin.events.observation.empty import NullObservation
+from opendevin.events.observation.error import ErrorObservation
+from opendevin.events.observation.observation import Observation
+
+
+class StuckDetector:
+    def __init__(self, state: State):
+        self.state = state
+
+    def is_stuck(self):
+        # filter out MessageAction with source='user' from history
+        filtered_history = [
+            event
+            for event in self.state.history.get_events()
+            if not (
+                (isinstance(event, MessageAction) and event.source == EventSource.USER)
+                or
+                # there might be some NullAction or NullObservation in the history at least for now
+                isinstance(event, NullAction)
+                or isinstance(event, NullObservation)
+            )
+        ]
+
+        # it takes 3 actions minimum to detect a loop, otherwise nothing to do here
+        if len(filtered_history) < 3:
+            return False
+
+        # the first few scenarios detect 3 or 4 repeated steps
+        # prepare the last 4 actions and observations, to check them out
+        last_actions: list[Event] = []
+        last_observations: list[Event] = []
+
+        # retrieve the last four actions and observations starting from the end of history, wherever they are
+        for event in reversed(filtered_history):
+            if isinstance(event, Action) and len(last_actions) < 4:
+                last_actions.append(event)
+            elif isinstance(event, Observation) and len(last_observations) < 4:
+                last_observations.append(event)
+
+            if len(last_actions) == 4 and len(last_observations) == 4:
+                break
+
+        # scenario 1: same action, same observation
+        if self._is_stuck_repeating_action_observation(last_actions, last_observations):
+            return True
+
+        # scenario 2: same action, errors
+        if self._is_stuck_repeating_action_error(last_actions, last_observations):
+            return True
+
+        # scenario 3: monologue
+        if self._is_stuck_monologue(filtered_history):
+            return True
+
+        # scenario 4: action, observation pattern on the last six steps
+        if len(filtered_history) < 6:
+            return False
+        if self._is_stuck_action_observation_pattern(filtered_history):
+            return True
+
+        return False
+
+    def _is_stuck_repeating_action_observation(self, last_actions, last_observations):
+        # scenario 1: same action, same observation
+        # it takes 4 actions and 4 observations to detect a loop
+        # assert len(last_actions) == 4 and len(last_observations) == 4
+
+        # reset almost_stuck reminder
+        self.state.almost_stuck = 0
+
+        # almost stuck? if two actions, obs are the same, we're almost stuck
+        if len(last_actions) >= 2 and len(last_observations) >= 2:
+            actions_equal = all(
+                self._eq_no_pid(last_actions[0], action) for action in last_actions[:2]
+            )
+            observations_equal = all(
+                self._eq_no_pid(last_observations[0], observation)
+                for observation in last_observations[:2]
+            )
+
+            # the last two actions and obs are the same?
+            if actions_equal and observations_equal:
+                self.state.almost_stuck = 2
+
+            # the last three actions and observations are the same?
+            if len(last_actions) >= 3 and len(last_observations) >= 3:
+                if (
+                    actions_equal
+                    and observations_equal
+                    and self._eq_no_pid(last_actions[0], last_actions[2])
+                    and self._eq_no_pid(last_observations[0], last_observations[2])
+                ):
+                    self.state.almost_stuck = 1
+
+            if len(last_actions) == 4 and len(last_observations) == 4:
+                if (
+                    actions_equal
+                    and observations_equal
+                    and self._eq_no_pid(last_actions[0], last_actions[3])
+                    and self._eq_no_pid(last_observations[0], last_observations[3])
+                ):
+                    logger.warning('Action, Observation loop detected')
+                    self.state.almost_stuck = 0
+                    return True
+
+        return False
+
+    def _is_stuck_repeating_action_error(self, last_actions, last_observations):
+        """Scenario 2: the same action, four times in a row, each ending in an error.
+
+        Both arguments list the most recent events first, four at most.
+        Returns True if the loop is detected, False otherwise.
+        """
+        # all() is vacuously True on an empty list: require four of each
+        if len(last_actions) != 4 or len(last_observations) != 4:
+            return False
+
+        # are the last four actions the same?
+        if not all(self._eq_no_pid(last_actions[0], a) for a in last_actions):
+            return False
+
+        # are the last four observations all errors?
+        if all(isinstance(obs, ErrorObservation) for obs in last_observations):
+            logger.warning('Action, ErrorObservation loop detected')
+            return True
+
+        # or all IPythonRunCellObservation whose last 100 characters hold the
+        # SyntaxError marker, with fewer than 10 characters after it?
+        marker = 'SyntaxError: unterminated string literal (detected at line'
+        if all(
+            isinstance(obs, IPythonRunCellObservation)
+            and marker in cast(IPythonRunCellObservation, obs).content[-100:]
+            and len(obs.content.split(marker)[-1]) < 10
+            for obs in last_observations
+        ):
+            logger.warning('Action, IPythonRunCellObservation loop detected')
+            return True
+
+        return False
+
+    def _is_stuck_monologue(self, filtered_history):
+        # scenario 3: monologue
+        # check for repeated MessageActions with source=AGENT
+        # see if the agent is engaged in a good old monologue, telling itself the same thing over and over
+        agent_message_actions = [
+            (i, event)
+            for i, event in enumerate(filtered_history)
+            if isinstance(event, MessageAction) and event.source == EventSource.AGENT
+        ]
+
+        # last three message actions will do for this check
+        if len(agent_message_actions) >= 3:
+            last_agent_message_actions = agent_message_actions[-3:]
+
+            if all(
+                (last_agent_message_actions[0][1] == action[1])
+                for action in last_agent_message_actions
+            ):
+                # check if there are any observations between the repeated MessageActions
+                # then it's not yet a loop, maybe it can recover
+                start_index = last_agent_message_actions[0][0]
+                end_index = last_agent_message_actions[-1][0]
+
+                has_observation_between = False
+                for event in filtered_history[start_index + 1 : end_index]:
+                    if isinstance(event, Observation):
+                        has_observation_between = True
+                        break
+
+                if not has_observation_between:
+                    logger.warning('Repeated MessageAction with source=AGENT detected')
+                    return True
+        return False
+
+    def _is_stuck_action_observation_pattern(self, filtered_history):
+        # scenario 4: action, observation pattern on the last six steps
+        # check if the agent repeats the same (Action, Observation)
+        # every other step in the last six steps
+        last_six_actions: list[Event] = []
+        last_six_observations: list[Event] = []
+
+        # the end of history is most interesting
+        for event in reversed(filtered_history):
+            if isinstance(event, Action) and len(last_six_actions) < 6:
+                last_six_actions.append(event)
+            elif isinstance(event, Observation) and len(last_six_observations) < 6:
+                last_six_observations.append(event)
+
+            if len(last_six_actions) == 6 and len(last_six_observations) == 6:
+                break
+
+        # this pattern is every other step, like:
+        # (action_1, obs_1), (action_2, obs_2), (action_1, obs_1), (action_2, obs_2),...
+        if len(last_six_actions) == 6 and len(last_six_observations) == 6:
+            actions_equal = (
+                # action_0 == action_2 == action_4
+                self._eq_no_pid(last_six_actions[0], last_six_actions[2])
+                and self._eq_no_pid(last_six_actions[0], last_six_actions[4])
+                # action_1 == action_3 == action_5
+                and self._eq_no_pid(last_six_actions[1], last_six_actions[3])
+                and self._eq_no_pid(last_six_actions[1], last_six_actions[5])
+            )
+            observations_equal = (
+                # obs_0 == obs_2 == obs_4
+                self._eq_no_pid(last_six_observations[0], last_six_observations[2])
+                and self._eq_no_pid(last_six_observations[0], last_six_observations[4])
+                # obs_1 == obs_3 == obs_5
+                and self._eq_no_pid(last_six_observations[1], last_six_observations[3])
+                and self._eq_no_pid(last_six_observations[1], last_six_observations[5])
+            )
+
+            if actions_equal and observations_equal:
+                logger.warning('Action, Observation pattern detected')
+                return True
+        return False
+
+    def _eq_no_pid(self, obj1, obj2):
+        if isinstance(obj1, CmdOutputObservation) and isinstance(
+            obj2, CmdOutputObservation
+        ):
+            # for loop detection, ignore command_id, which is the pid
+            return obj1.command == obj2.command and obj1.exit_code == obj2.exit_code
+        else:
+            # this is the default comparison
+            return obj1 == obj2
diff --git a/opendevin/core/logger.py b/opendevin/core/logger.py
index b2eaae0aa7..017c7a1d83 100644
--- a/opendevin/core/logger.py
+++ b/opendevin/core/logger.py
@@ -164,10 +164,9 @@ def log_uncaught_exceptions(ex_cls, ex, tb):
 sys.excepthook = log_uncaught_exceptions
 
 opendevin_logger = logging.getLogger('opendevin')
+opendevin_logger.setLevel(logging.INFO)
 if config.debug:
     opendevin_logger.setLevel(logging.DEBUG)
-else:
-    opendevin_logger.setLevel(logging.INFO)
 opendevin_logger.addHandler(get_file_handler())
 opendevin_logger.addHandler(get_console_handler())
 opendevin_logger.addFilter(SensitiveDataFilter(opendevin_logger.name))
diff --git a/opendevin/events/event.py b/opendevin/events/event.py
index a36f584a4c..7cf6d4accd 100644
--- a/opendevin/events/event.py
+++ b/opendevin/events/event.py
@@ -17,7 +17,7 @@ class Event:
         return ''
 
     @property
-    def id(self) -> int | None:
+    def id(self) -> int:
         if hasattr(self, '_id'):
             return self._id  # type: ignore[attr-defined]
         return -1
diff --git a/opendevin/events/stream.py b/opendevin/events/stream.py
index 1284473c36..15f09e7a73 100644
--- a/opendevin/events/stream.py
+++ b/opendevin/events/stream.py
@@ -41,8 +41,11 @@ class EventStream:
         try:
             events = self._file_store.list(f'sessions/{self.sid}/events')
         except FileNotFoundError:
-            logger.warning(f'No events found for session {self.sid}')
+            logger.debug(f'No events found for session {self.sid}')
+            self._cur_id = 0
             return
+
+        # if we have events, we need to find the highest id to prepare for new events
         for event_str in events:
             id = self._get_id_from_filename(event_str)
             if id >= self._cur_id:
@@ -59,17 +62,41 @@ class EventStream:
             logger.warning(f'get id from filename ({filename}) failed.')
             return -1
 
-    def get_events(self, start_id=0, end_id=None) -> Iterable[Event]:
-        event_id = start_id
-        while True:
-            if end_id is not None and event_id > end_id:
-                break
-            try:
-                event = self.get_event(event_id)
-            except FileNotFoundError:
-                break
-            yield event
-            event_id += 1
+    def get_events(
+        self,
+        start_id=0,
+        end_id=None,
+        reverse=False,
+        filter_out_type: tuple[type[Event], ...] | None = None,
+    ) -> Iterable[Event]:
+        if reverse:
+            if end_id is None:
+                end_id = self._cur_id - 1
+            event_id = end_id
+            while event_id >= start_id:
+                try:
+                    event = self.get_event(event_id)
+                    if filter_out_type is None or not isinstance(
+                        event, filter_out_type
+                    ):
+                        yield event
+                except FileNotFoundError:
+                    logger.debug(f'No event found for ID {event_id}')
+                event_id -= 1
+        else:
+            event_id = start_id
+            while True:
+                if end_id is not None and event_id > end_id:
+                    break
+                try:
+                    event = self.get_event(event_id)
+                    if filter_out_type is None or not isinstance(
+                        event, filter_out_type
+                    ):
+                        yield event
+                except FileNotFoundError:
+                    break
+                event_id += 1
 
     def get_event(self, id: int) -> Event:
         filename = self._get_filename_for_id(id)
@@ -77,6 +104,12 @@ class EventStream:
         data = json.loads(content)
         return event_from_dict(data)
 
+    def get_latest_event(self) -> Event:
+        return self.get_event(self._cur_id - 1)
+
+    def get_latest_event_id(self) -> int:
+        return self._cur_id - 1
+
     def subscribe(self, id: EventStreamSubscriber, callback: Callable, append=False):
         if id in self._subscribers:
             if append:
@@ -99,8 +132,8 @@ class EventStream:
             event._id = self._cur_id  # type: ignore [attr-defined]
             self._cur_id += 1
         logger.debug(f'Adding {type(event).__name__} id={event.id} from {source.name}')
-        event._timestamp = datetime.now()  # type: ignore[attr-defined]
-        event._source = source  # type: ignore[attr-defined]
+        event._timestamp = datetime.now()  # type: ignore [attr-defined]
+        event._source = source  # type: ignore [attr-defined]
         data = event_to_dict(event)
         if event.id is not None:
             self._file_store.write(
@@ -109,3 +142,14 @@ class EventStream:
         for stack in self._subscribers.values():
             callback = stack[-1]
             asyncio.create_task(callback(event))
+
+    def filtered_events_by_source(self, source: EventSource):
+        for event in self.get_events():
+            if event.source == source:
+                yield event
+
+    def clear(self):
+        self._file_store.delete(f'sessions/{self.sid}')
+        self._cur_id = 0
+        # self._subscribers = {}
+        self._reinitialize_from_file_store()
diff --git a/opendevin/memory/README.md b/opendevin/memory/README.md
new file mode 100644
index 0000000000..7ef4f762b0
--- /dev/null
+++ b/opendevin/memory/README.md
@@ -0,0 +1,23 @@
+# Memory Component
+
+- Short Term History
+- Memory Condenser
+- Long Term Memory
+
+## Short Term History
+- Short term history filters the event stream and computes the messages that are injected into the context
+- It filters out certain events of no interest for the Agent, such as AgentStateChangedObservation or NullAction/NullObservation
+- When the context window or the token limit set by the user is exceeded, history starts condensing chunks of messages into summaries.
+- Each summary is then injected into the context, in the place of the respective chunk it summarizes
+
+## Memory Condenser
+- Memory condenser is responsible for summarizing the chunks of events
+- It summarizes the earlier events first
+- It starts with the earliest agent actions and observations between two user messages
+- Then it does the same for later chunks of events between user messages
+- If there are no more agent events, it summarizes the user messages, this time one by one, if they're large enough and not immediately after an AgentFinishAction event (we assume those are tasks, potentially important)
+- Summaries are retrieved from the LLM as AgentSummarizeAction, and are saved in State.
+
+## Long Term Memory
+- Long term memory component stores embeddings for events and prompts in a vector store
+- The agent can query it when it needs detailed information about a past event or to learn new actions
diff --git a/opendevin/memory/history.py b/opendevin/memory/history.py
index e9d20b5f22..30e405c85e 100644
--- a/opendevin/memory/history.py
+++ b/opendevin/memory/history.py
@@ -1,54 +1,257 @@
-import opendevin.core.utils.json as json
-from opendevin.core.exceptions import AgentEventTypeError
+from typing import ClassVar, Iterable
+
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.events.action.action import Action
+from opendevin.events.action.agent import (
+    AgentDelegateAction,
+    ChangeAgentStateAction,
+)
+from opendevin.events.action.empty import NullAction
+from opendevin.events.action.message import MessageAction
+from opendevin.events.event import Event, EventSource
+from opendevin.events.observation.agent import AgentStateChangedObservation
+from opendevin.events.observation.commands import CmdOutputObservation
+from opendevin.events.observation.delegate import AgentDelegateObservation
+from opendevin.events.observation.empty import NullObservation
+from opendevin.events.observation.observation import Observation
+from opendevin.events.serialization.event import event_to_dict
+from opendevin.events.stream import EventStream
 
 
-class ShortTermHistory:
+class ShortTermHistory(list[Event]):
     """
-    The short term history is the most recent series of events.
-    An agent can send this in the prompt or use it for other purpose.
+    A list of events that represents the short-term memory of the agent.
+
+    This class provides methods to retrieve and filter the events in the history of the running agent from the event stream.
     """
 
+    start_id: int
+    end_id: int
+    _event_stream: EventStream
+    delegates: dict[tuple[int, int], tuple[str, str]]
+    filter_out: ClassVar[tuple[type[Event], ...]] = (
+        NullAction,
+        NullObservation,
+        ChangeAgentStateAction,
+        AgentStateChangedObservation,
+    )
+
     def __init__(self):
-        """
-        Initialize the empty list of events
-        """
-        self.events = []
+        super().__init__()
+        self.start_id = -1
+        self.end_id = -1
+        self.delegates = {}
 
-    def add_event(self, event_dict: dict):
-        """
-        Adds an event to memory if it is a valid event.
+    def set_event_stream(self, event_stream: EventStream):
+        self._event_stream = event_stream
 
-        Parameters:
-        - event_dict (dict): The event that we want to add to memory
-
-        Raises:
-        - AgentEventTypeError: If event_dict is not a dict
+    def get_events_as_list(self) -> list[Event]:
         """
-        if not isinstance(event_dict, dict):
-            raise AgentEventTypeError()
-        self.events.append(event_dict)
-
-    def get_events(self):
+        Return the history as a list of Event objects.
         """
-        Get the events in the agent's recent history.
+        return list(self.get_events())
 
-        Returns:
-        - List: The list of events that the agent remembers easily.
+    def get_events(self, reverse: bool = False) -> Iterable[Event]:
         """
-        return self.events
+        Return the events as a stream of Event objects.
+        """
+        # TODO handle AgentRejectAction, if it's not part of a chunk ending with an AgentDelegateObservation
+        # or even if it is, because currently we don't add it to the summary
 
-    def get_total_length(self):
-        """
-        Gives the total number of characters in all history
+        # iterate from start_id to end_id, or reverse
+        start_id = self.start_id if self.start_id != -1 else 0
+        end_id = (
+            self.end_id
+            if self.end_id != -1
+            else self._event_stream.get_latest_event_id()
+        )
 
-        Returns:
-        - Int: Total number of characters of the recent history.
+        for event in self._event_stream.get_events(
+            start_id=start_id,
+            end_id=end_id,
+            reverse=reverse,
+            filter_out_type=self.filter_out,
+        ):
+            # TODO add summaries
+            # and filter out events that were included in a summary
+
+            # filter out the events from a delegate of the current agent
+            if not any(
+                # except for the delegate action and observation themselves, currently
+                # AgentDelegateAction has id = delegate_start
+                # AgentDelegateObservation has id = delegate_end
+                delegate_start < event.id < delegate_end
+                for delegate_start, delegate_end in self.delegates.keys()
+            ):
+                yield event
+
+    def get_last_action(self, end_id: int = -1) -> Action | None:
         """
-        total_length = 0
-        for t in self.events:
-            try:
-                total_length += len(json.dumps(t))
-            except TypeError as e:
-                logger.error('Error serializing event: %s', str(e), exc_info=False)
-        return total_length
+        Return the last action from the event stream, filtered to exclude unwanted events.
+        """
+        # from end_id in reverse, find the first action
+        end_id = self._event_stream.get_latest_event_id() if end_id == -1 else end_id
+
+        last_action = next(
+            (
+                event
+                for event in self._event_stream.get_events(
+                    end_id=end_id, reverse=True, filter_out_type=self.filter_out
+                )
+                if isinstance(event, Action)
+            ),
+            None,
+        )
+
+        return last_action
+
+    def get_last_observation(self, end_id: int = -1) -> Observation | None:
+        """
+        Find the most recent Observation in the event stream, skipping filtered-out event types.
+
+        `end_id` is the id to search backwards from; -1 (the default) means the latest event.
+        Returns None when no Observation is found.
+        """
+        if end_id == -1:
+            end_id = self._event_stream.get_latest_event_id()
+
+        # walk the stream newest-first and stop at the first Observation encountered
+        newest_first = self._event_stream.get_events(
+            end_id=end_id, reverse=True, filter_out_type=self.filter_out
+        )
+        for candidate in newest_first:
+            if isinstance(candidate, Observation):
+                return candidate
+
+        return None
+
+    def get_last_user_message(self) -> str:
+        """
+        Return the content of the newest MessageAction whose source is the user.
+
+        The whole event stream is scanned newest-first (no type filter is applied here).
+        Returns an empty string when there is no user message.
+        """
+        for candidate in self._event_stream.get_events(reverse=True):
+            is_message = isinstance(candidate, MessageAction)
+            if is_message and candidate.source == EventSource.USER:
+                content = candidate.content
+                # a None content is reported as '' as well
+                return content if content is not None else ''
+
+        return ''
+
+    def get_last_events(self, n: int) -> list[Event]:
+        """
+        Return the events whose ids fall within the last n ids of the event stream.
+
+        Events of the filtered-out types are dropped, so fewer than n may be returned.
+        """
+        # dummy agent is using this
+        # it should work, but it's not great to store temporary lists now just for a test
+        latest_id = self._event_stream.get_latest_event_id()
+        first_id = max(0, latest_id - n + 1)
+
+        recent_events = self._event_stream.get_events(
+            start_id=first_id,
+            end_id=latest_id,
+            filter_out_type=self.filter_out,
+        )
+        return list(recent_events)
+
+    def on_event(self, event: Event):
+        """
+        Record the id range of a delegate when its AgentDelegateObservation arrives.
+
+        The range spans from the closest preceding AgentDelegateAction to this observation
+        and is stored in self.delegates along with the delegate's agent name and task.
+        Events of any other type are ignored.
+        """
+        if not isinstance(event, AgentDelegateObservation):
+            return
+
+        logger.debug('AgentDelegateObservation received')
+
+        # look backwards for the AgentDelegateAction that started this delegate; the id
+        # range is saved so its events can later be excluded from the parent or summarized
+        delegate_end = event.id
+        earlier = self._event_stream.get_events(end_id=delegate_end - 1, reverse=True)
+        delegate_action = next(
+            (prev for prev in earlier if isinstance(prev, AgentDelegateAction)), None
+        )
+
+        if delegate_action is None or delegate_action.id == -1:
+            logger.error(
+                f'No AgentDelegateAction found for AgentDelegateObservation with id={delegate_end}'
+            )
+            return
+
+        delegate_start = delegate_action.id
+        delegate_agent: str = delegate_action.agent
+        delegate_task: str = delegate_action.inputs.get('task', '')
+        self.delegates[(delegate_start, delegate_end)] = (delegate_agent, delegate_task)
+        logger.debug(
+            f'Delegate {delegate_agent} with task {delegate_task} ran from id={delegate_start} to id={delegate_end}'
+        )
+
+    # TODO remove me when unnecessary
+    # history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
+    # we rebuild the pairs here
+    # for compatibility with the existing output format in evaluations
+    def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]:
+        """
+        Return the (action, observation) pairs of get_pairs() with each event converted to a dict.
+        """
+        return [
+            (event_to_dict(act), event_to_dict(obs)) for act, obs in self.get_pairs()
+        ]
+
+    def get_pairs(self) -> list[tuple[Action, Observation]]:
+        """
+        Return the history as a list of tuples (action, observation).
+
+        Observations are matched to actions through their `cause` id. Actions without a
+        matching observation are paired with an empty NullObservation. Remaining observations,
+        except NullObservations, are paired with a NullAction and appended after all the actions.
+        """
+        actions_by_id: dict[int, Action] = {}
+        observations_by_cause: dict[int, Observation] = {}
+
+        # runnable actions are set as cause of observations
+        # (MessageAction, NullObservation) for source=USER
+        # (MessageAction, NullObservation) for source=AGENT
+        # (other_action?, NullObservation)
+        # (NullAction, CmdOutputObservation) background CmdOutputObservations
+
+        for event in self.get_events_as_list():
+            if event.id is None or event.id == -1:
+                logger.debug(f'Event {event} has no ID')
+
+            if isinstance(event, Action):
+                actions_by_id[event.id] = event
+
+            if not isinstance(event, Observation):
+                continue
+
+            if event.cause is None or event.cause == -1:
+                logger.debug(f'Observation {event} has no cause')
+
+            # an observation without a cause cannot be matched to an action, skip it
+            if event.cause is not None:
+                observations_by_cause[event.cause] = event
+
+        # one pair per action, in the order the actions were recorded
+        pairs: list[tuple[Action, Observation]] = [
+            (action, observations_by_cause.get(action_id) or NullObservation(''))
+            for action_id, action in actions_by_id.items()
+        ]
+
+        # then the observations that no recorded action caused
+        for cause_id, observation in observations_by_cause.items():
+            if cause_id in actions_by_id or isinstance(observation, NullObservation):
+                continue
+            if not isinstance(observation, CmdOutputObservation):
+                logger.debug(f'Observation {observation} has no cause')
+            pairs.append((NullAction(), observation))
+
+        return pairs
diff --git a/opendevin/storage/local.py b/opendevin/storage/local.py
index 9b977d9448..c91bf75ffc 100644
--- a/opendevin/storage/local.py
+++ b/opendevin/storage/local.py
@@ -1,4 +1,7 @@
 import os
+import shutil
+
+from opendevin.core.logger import opendevin_logger as logger
 
 from .files import FileStore
 
@@ -34,5 +37,16 @@ class LocalFileStore(FileStore):
         return files
 
     def delete(self, path: str) -> None:
-        full_path = self.get_full_path(path)
-        os.remove(full_path)
+        """Delete the file or directory tree at `path`; exceptions are logged, not raised."""
+        try:
+            full_path = self.get_full_path(path)
+            if os.path.isfile(full_path):
+                os.remove(full_path)
+                logger.debug(f'Removed local file: {full_path}')
+            elif os.path.isdir(full_path):
+                shutil.rmtree(full_path)
+                logger.debug(f'Removed local directory: {full_path}')
+            elif not os.path.exists(full_path):
+                logger.debug(f'Local path does not exist: {full_path}')
+        except Exception as e:
+            logger.error(f'Error clearing local file store: {str(e)}')
diff --git a/opendevin/storage/memory.py b/opendevin/storage/memory.py
index ea797ba7c6..8baa1ff69e 100644
--- a/opendevin/storage/memory.py
+++ b/opendevin/storage/memory.py
@@ -1,5 +1,7 @@
 import os
 
+from opendevin.core.logger import opendevin_logger as logger
+
 from .files import FileStore
 
 
@@ -37,4 +39,19 @@ class InMemoryFileStore(FileStore):
         return files
 
     def delete(self, path: str) -> None:
-        del self.files[path]
+        """
+        Delete the entry stored at `path` and every entry below it.
+
+        `path` is matched as a whole path: deleting 'a/b' removes 'a/b' and 'a/b/c',
+        but leaves the unrelated sibling 'a/bc' alone. An empty `path` clears the store.
+        Exceptions are logged instead of raised.
+        """
+        try:
+            # only extend the match across a '/' boundary, never into a longer name
+            prefix = path if not path or path.endswith('/') else path + '/'
+            doomed = [k for k in self.files if k == path or k.startswith(prefix)]
+            for key in doomed:
+                del self.files[key]
+            logger.debug(f'Cleared in-memory file store: {path}')
+        except Exception as e:
+            logger.error(f'Error clearing in-memory file store: {str(e)}')
diff --git a/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_001.log b/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_001.log
index e1facc9476..1e4fb4576c 100644
--- a/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_001.log
+++ b/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_001.log
@@ -388,4 +388,4 @@ NOW, LET'S START!
 
 Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.
 
-ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_005.log b/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_005.log
index 434bde5b3a..68127603ca 100644
--- a/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_005.log
+++ b/tests/integration/mock/CodeActAgent/test_browse_internet/prompt_005.log
@@ -400,4 +400,4 @@ Certainly! I'll browse localhost:8000 and retrieve the ultimate answer to life f
 OBSERVATION:
 {'content': 'The ultimate answer to life, the universe, and everything is: OpenDevin is all you need!'}
 
-ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_001.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_001.log
index 372c6a5cc3..7d311efcd6 100644
--- a/tests/integration/mock/CodeActAgent/test_edits/prompt_001.log
+++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_001.log
@@ -388,4 +388,4 @@ NOW, LET'S START!
 
 Fix typos in bad.txt. Do not ask me for confirmation at any point.
 
-ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
index 74618a0e1d..3d16a5a1c3 100644
--- a/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
+++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
@@ -401,4 +401,4 @@ OBSERVATION:
 ./bad.txt
 [Command -1 finished with exit code 0]
 
-ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log
index abfcaff903..7e41d2e86b 100644
--- a/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log
+++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_003.log
@@ -418,4 +418,4 @@ OBSERVATION:
 4|Enjoy!
 
 
-ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_004.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_004.log
index ba3af9492c..ec1601198a 100644
--- a/tests/integration/mock/CodeActAgent/test_edits/prompt_004.log
+++ b/tests/integration/mock/CodeActAgent/test_edits/prompt_004.log
@@ -445,4 +445,4 @@ OBSERVATION:
 [File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
 
 
-ENVIRONMENT REMINDER: You have 11 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 11 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_ipython/prompt_001.log b/tests/integration/mock/CodeActAgent/test_ipython/prompt_001.log
index 473832f7e3..538542e83d 100644
--- a/tests/integration/mock/CodeActAgent/test_ipython/prompt_001.log
+++ b/tests/integration/mock/CodeActAgent/test_ipython/prompt_001.log
@@ -388,4 +388,4 @@ NOW, LET'S START!
 
 Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'. Do not ask me for confirmation at any point.
 
-ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_ipython/prompt_002.log b/tests/integration/mock/CodeActAgent/test_ipython/prompt_002.log
index a7cb2b1426..6b064ba0d0 100644
--- a/tests/integration/mock/CodeActAgent/test_ipython/prompt_002.log
+++ b/tests/integration/mock/CodeActAgent/test_ipython/prompt_002.log
@@ -403,4 +403,4 @@ OBSERVATION:
 File created successfully.
 
 
-ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_ipython/prompt_003.log b/tests/integration/mock/CodeActAgent/test_ipython/prompt_003.log
index dc9666bf2d..f8cbe1d680 100644
--- a/tests/integration/mock/CodeActAgent/test_ipython/prompt_003.log
+++ b/tests/integration/mock/CodeActAgent/test_ipython/prompt_003.log
@@ -418,4 +418,4 @@ OBSERVATION:
 Content of /workspace/test.txt: hello world
 
 
-ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_001.log b/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_001.log
index 9e8d7368b6..c19de572a7 100644
--- a/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_001.log
+++ b/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_001.log
@@ -388,4 +388,4 @@ NOW, LET'S START!
 
 Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt. Do not ask me for confirmation at any point.
 
-ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_002.log b/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_002.log
index 5e75d582bc..424b26cc48 100644
--- a/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_002.log
+++ b/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_002.log
@@ -401,4 +401,4 @@ OBSERVATION:
 [Package installed successfully]
 [Kernel restarted successfully to load the package]
 
-ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_003.log b/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_003.log
index b2176a1d4a..6924b264e5 100644
--- a/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_003.log
+++ b/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_003.log
@@ -417,4 +417,4 @@ OBSERVATION:
 Version written to /workspace/test.txt
 
 
-ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_004.log b/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_004.log
index cf0aa6cf69..a53ba152a2 100644
--- a/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_004.log
+++ b/tests/integration/mock/CodeActAgent/test_ipython_module/prompt_004.log
@@ -430,4 +430,4 @@ OBSERVATION:
 pymsgbox version: 1.0.9
 [Command -1 finished with exit code 0]
 
-ENVIRONMENT REMINDER: You have 11 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 11 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_001.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_001.log
index a8fe9cfcb8..0ca572f905 100644
--- a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_001.log
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_001.log
@@ -388,4 +388,4 @@ NOW, LET'S START!
 
 Write a shell script 'hello.sh' that prints 'hello'. Do not ask me for confirmation at any point.
 
-ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 14 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
index a4bba50cd8..c940c491fb 100644
--- a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_002.log
@@ -403,4 +403,4 @@ OBSERVATION:
 
 [Command -1 finished with exit code 0]
 
-ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 13 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_003.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_003.log
index 129d0a6a39..d8cb0db30b 100644
--- a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_003.log
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_003.log
@@ -417,4 +417,4 @@ OBSERVATION:
 echo "hello"
 [Command -1 finished with exit code 0]
 
-ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 12 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_004.log b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_004.log
index 17a8a44311..c43079410e 100644
--- a/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_004.log
+++ b/tests/integration/mock/CodeActAgent/test_write_simple_script/prompt_004.log
@@ -430,4 +430,4 @@ OBSERVATION:
 hello
 [Command -1 finished with exit code 0]
 
-ENVIRONMENT REMINDER: You have 11 turns left to complete the task. When finished reply with <finish></finish>.
+ENVIRONMENT REMINDER: You have 11 turns left to complete the task. When finished reply with <finish></finish>
diff --git a/tests/integration/mock/DelegatorAgent/test_edits/prompt_002.log b/tests/integration/mock/DelegatorAgent/test_edits/prompt_002.log
index 82542d5fe4..828f19341a 100644
--- a/tests/integration/mock/DelegatorAgent/test_edits/prompt_002.log
+++ b/tests/integration/mock/DelegatorAgent/test_edits/prompt_002.log
@@ -41,7 +41,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"action": "null", "args": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'run', 'args': {'command': 'ls', 'background': False}} has the wrong arguments", "extras": {}}]]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": ""}}, {"source": "agent", "observation": "run", "content": "bad.txt", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_edits/prompt_003.log b/tests/integration/mock/DelegatorAgent/test_edits/prompt_003.log
index 1f985f1bec..3f0b617a87 100644
--- a/tests/integration/mock/DelegatorAgent/test_edits/prompt_003.log
+++ b/tests/integration/mock/DelegatorAgent/test_edits/prompt_003.log
@@ -41,7 +41,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"action": "null", "args": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'run', 'args': {'command': 'ls', 'background': False}} has the wrong arguments", "extras": {}}], [{"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "bad.txt"}}]]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": ""}}, {"source": "agent", "observation": "run", "content": "bad.txt", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}, {"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "bad.txt"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_edits/prompt_004.log b/tests/integration/mock/DelegatorAgent/test_edits/prompt_004.log
index dc96c7bbfc..33b09c670a 100644
--- a/tests/integration/mock/DelegatorAgent/test_edits/prompt_004.log
+++ b/tests/integration/mock/DelegatorAgent/test_edits/prompt_004.log
@@ -11,7 +11,7 @@ Fix typos in bad.txt. Do not ask me for confirmation at any point.
 
 Here's a summary of the codebase, as it relates to this task:
 
-The codebase contains a single file named 'bad.txt' with some typos. The content of 'bad.txt' is:
+The codebase contains a single file named 'bad.txt' which has some typos. The content of 'bad.txt' is:
 
 This is a stupid typoo.
 Really?
diff --git a/tests/integration/mock/DelegatorAgent/test_edits/prompt_005.log b/tests/integration/mock/DelegatorAgent/test_edits/prompt_005.log
index ddccbde698..a182233f5a 100644
--- a/tests/integration/mock/DelegatorAgent/test_edits/prompt_005.log
+++ b/tests/integration/mock/DelegatorAgent/test_edits/prompt_005.log
@@ -11,7 +11,7 @@ Fix typos in bad.txt. Do not ask me for confirmation at any point.
 
 Here's a summary of the codebase, as it relates to this task:
 
-The codebase contains a single file named 'bad.txt' with some typos. The content of 'bad.txt' is:
+The codebase contains a single file named 'bad.txt' which has some typos. The content of 'bad.txt' is:
 
 This is a stupid typoo.
 Really?
@@ -48,7 +48,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "bad.txt"}}]]
+[{"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "bad.txt"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_edits/prompt_006.log b/tests/integration/mock/DelegatorAgent/test_edits/prompt_006.log
index 8ad76d7bdb..bff8bdd5ba 100644
--- a/tests/integration/mock/DelegatorAgent/test_edits/prompt_006.log
+++ b/tests/integration/mock/DelegatorAgent/test_edits/prompt_006.log
@@ -11,7 +11,7 @@ Fix typos in bad.txt. Do not ask me for confirmation at any point.
 
 Here's a summary of the codebase, as it relates to this task:
 
-The codebase contains a single file named 'bad.txt' with some typos. The content of 'bad.txt' is:
+The codebase contains a single file named 'bad.txt' which has some typos. The content of 'bad.txt' is:
 
 This is a stupid typoo.
 Really?
@@ -48,7 +48,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "bad.txt"}}], [{"source": "agent", "action": "write", "args": {"path": "bad.txt", "content": "This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!\n", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "bad.txt"}}]]
+[{"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "bad.txt"}}, {"source": "agent", "action": "write", "args": {"path": "bad.txt", "content": "This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!\n", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "bad.txt"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_edits/prompt_008.log b/tests/integration/mock/DelegatorAgent/test_edits/prompt_008.log
index e98a50825e..02f06cbe73 100644
--- a/tests/integration/mock/DelegatorAgent/test_edits/prompt_008.log
+++ b/tests/integration/mock/DelegatorAgent/test_edits/prompt_008.log
@@ -39,7 +39,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!\n\n", "extras": {"path": "bad.txt"}}]]
+[{"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!\n\n", "extras": {"path": "bad.txt"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_edits/response_001.log b/tests/integration/mock/DelegatorAgent/test_edits/response_001.log
index e068bf0622..0a4f0f2b80 100644
--- a/tests/integration/mock/DelegatorAgent/test_edits/response_001.log
+++ b/tests/integration/mock/DelegatorAgent/test_edits/response_001.log
@@ -1,7 +1,6 @@
 {
   "action": "run",
   "args": {
-    "command": "ls",
-    "background": false
+    "command": "ls"
   }
 }
diff --git a/tests/integration/mock/DelegatorAgent/test_edits/response_003.log b/tests/integration/mock/DelegatorAgent/test_edits/response_003.log
index 9c7158338d..dd5629b6e5 100644
--- a/tests/integration/mock/DelegatorAgent/test_edits/response_003.log
+++ b/tests/integration/mock/DelegatorAgent/test_edits/response_003.log
@@ -2,7 +2,7 @@
   "action": "finish",
   "args": {
     "outputs": {
-      "summary": "The codebase contains a single file named 'bad.txt' with some typos. The content of 'bad.txt' is:\n\nThis is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n\nThe engineering team needs to correct the typos in this file."
+      "summary": "The codebase contains a single file named 'bad.txt' which has some typos. The content of 'bad.txt' is:\n\nThis is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n\nThe engineering team needs to correct the typos in this file."
     }
   }
 }
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_002.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_002.log
index 27c2c0e36a..a646e046dc 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_002.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_002.log
@@ -41,7 +41,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"action": "null", "args": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'run', 'args': {'command': 'ls', 'background': False}} has the wrong arguments", "extras": {}}]]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": ""}}, {"source": "agent", "observation": "run", "content": "", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_004.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_004.log
index 512ac3b89e..73320b2483 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_004.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_004.log
@@ -41,7 +41,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "hello.sh"}}]]
+[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\n# Print hello\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "hello.sh"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_005.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_005.log
index 9d9dd5d95e..afab1bb7d4 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_005.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_005.log
@@ -41,7 +41,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "hello.sh"}}], [{"action": "null", "args": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'run', 'args': {'command': 'bash hello.sh', 'background': False}} has the wrong arguments", "extras": {}}]]
+[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\n# Print hello\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": ""}}, {"source": "agent", "observation": "run", "content": "", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_006.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_006.log
index bf0ea12636..49ccb65d13 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_006.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_006.log
@@ -3,18 +3,25 @@
 ----------
 
 # Task
-You are a quality assurance engineer. Another engineer has made changes to the
-codebase which are supposed to solve this task:
+You are a software engineer. You've inherited an existing codebase, which you
+need to modify to complete this task:
 
 Write a shell script 'hello.sh' that prints 'hello'. Do not ask me for confirmation at any point.
 
-Note the changes might have already been applied in-line. You should focus on
-validating if the task is solved, nothing else.
+
+Here's a summary of the codebase, as it relates to this task:
+
+The codebase appears to be empty. Engineers should start everything from scratch.
+
 
 ## Available Actions
 * `run` - runs a command on the command line in a Linux shell. Arguments:
   * `command` - the command to run
 
+* `write` - writes the content to a file. Arguments:
+  * `path` - the path of the file to write
+  * `content` - the content to write to the file
+
 * `read` - reads the content of a file. Arguments:
   * `path` - the path of the file to read
 
@@ -26,12 +33,7 @@ validating if the task is solved, nothing else.
   * `outputs` - a dictionary representing the outputs of your task, if any
 
 
-You must ONLY `run` commands that have no side-effects, like `ls`, `grep`, and test scripts.
-
-Do NOT finish until you know whether the task is complete and correct.
-When you're done, add a `completed` boolean to the `outputs` of the `finish` action.
-If `completed` is `false`, you MUST also provide a `summary` in the `outputs` of the `finish` action
-explaining what the problem is.
+Do NOT finish until you have completed the tasks.
 
 ## History
 Here is a recent history of actions you've taken in service of this plan,
@@ -39,7 +41,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[]
+[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\n# Print hello\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": ""}}, {"source": "agent", "observation": "run", "content": "", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}, {"source": "agent", "action": "run", "args": {"command": "./hello.sh", "thought": ""}}, {"source": "agent", "observation": "run", "content": "hello", "extras": {"command_id": -1, "command": "./hello.sh", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_007.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_007.log
index 62abd6dc6f..bf0ea12636 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_007.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_007.log
@@ -39,7 +39,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\necho 'hello'\n", "extras": {"path": "hello.sh"}}]]
+[]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_008.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_008.log
index 2865c79e34..346ccd57a4 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_008.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_008.log
@@ -39,7 +39,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\necho 'hello'\n", "extras": {"path": "hello.sh"}}], [{"action": "null", "args": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'run', 'args': {'command': 'bash hello.sh', 'background': False}} has the wrong arguments", "extras": {}}]]
+[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\n\n# Print hello\necho 'hello'\n", "extras": {"path": "hello.sh"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_009.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_009.log
new file mode 100644
index 0000000000..e44c5d6354
--- /dev/null
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/prompt_009.log
@@ -0,0 +1,49 @@
+
+
+----------
+
+# Task
+You are a quality assurance engineer. Another engineer has made changes to the
+codebase which are supposed to solve this task:
+
+Write a shell script 'hello.sh' that prints 'hello'. Do not ask me for confirmation at any point.
+
+Note the changes might have already been applied in-line. You should focus on
+validating if the task is solved, nothing else.
+
+## Available Actions
+* `run` - runs a command on the command line in a Linux shell. Arguments:
+  * `command` - the command to run
+
+* `read` - reads the content of a file. Arguments:
+  * `path` - the path of the file to read
+
+* `message` - make a plan, set a goal, record your thoughts, or ask for more input from the user. Arguments:
+  * `content` - the thought to record
+  * `wait_for_response` - set to `true` to wait for the user to respond before proceeding
+
+* `finish` - if you're absolutely certain that you've completed your task, use the finish action to stop working. Arguments:
+  * `outputs` - a dictionary representing the outputs of your task, if any
+
+
+You must ONLY `run` commands that have no side-effects, like `ls`, `grep`, and test scripts.
+
+Do NOT finish until you know whether the task is complete and correct.
+When you're done, add a `completed` boolean to the `outputs` of the `finish` action.
+If `completed` is `false`, you MUST also provide a `summary` in the `outputs` of the `finish` action
+explaining what the problem is.
+
+## History
+Here is a recent history of actions you've taken in service of this plan,
+as well as observations you've made. This only includes the MOST RECENT
+actions and observations--more may have happened before that.
+They are time-ordered, with your most recent action at the bottom.
+
+[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\n\n# Print hello\necho 'hello'\n", "extras": {"path": "hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "bash hello.sh", "thought": ""}}, {"source": "agent", "observation": "run", "content": "hello", "extras": {"command_id": -1, "command": "bash hello.sh", "exit_code": 0}}]
+
+## Format
+Your response MUST be in JSON format. It must be an object, and it must contain two fields:
+* `action`, which is one of the actions specified here
+* `args`, which is a map of key-value pairs, specifying the arguments for that action
+
+You MUST NOT include any other text besides the JSON response
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_001.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_001.log
index e068bf0622..0a4f0f2b80 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_001.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_001.log
@@ -1,7 +1,6 @@
 {
   "action": "run",
   "args": {
-    "command": "ls",
-    "background": false
+    "command": "ls"
   }
 }
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_003.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_003.log
index f97464936b..cad275c859 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_003.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_003.log
@@ -1,7 +1,9 @@
+```json
 {
   "action": "write",
   "args": {
     "path": "hello.sh",
-    "content": "#!/bin/bash\necho 'hello'"
+    "content": "#!/bin/bash\n\n# Print hello\necho 'hello'"
   }
 }
+```
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_004.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_004.log
index 7055344e27..e9113919bd 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_004.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_004.log
@@ -1,7 +1,6 @@
 {
   "action": "run",
   "args": {
-    "command": "bash hello.sh",
-    "background": false
+    "command": "chmod +x hello.sh"
   }
 }
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_005.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_005.log
index d260ee3b2f..da7bb97444 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_005.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_005.log
@@ -1,8 +1,8 @@
+```json
 {
-  "action": "finish",
+  "action": "run",
   "args": {
-    "outputs": {
-      "result": "hello"
-    }
+    "command": "./hello.sh"
   }
 }
+```
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_006.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_006.log
index 9ccc3df9c5..762f9fe834 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_006.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_006.log
@@ -1,6 +1,8 @@
+```json
 {
-  "action": "read",
+  "action": "finish",
   "args": {
-    "path": "hello.sh"
+    "outputs": {}
   }
 }
+```
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_007.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_007.log
index 7055344e27..58119a3819 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_007.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_007.log
@@ -1,7 +1,8 @@
+```json
 {
-  "action": "run",
+  "action": "read",
   "args": {
-    "command": "bash hello.sh",
-    "background": false
+    "path": "hello.sh"
   }
 }
+```
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_008.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_008.log
index 5a88f936cb..67183065b0 100644
--- a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_008.log
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_008.log
@@ -1,23 +1,8 @@
-The task was to create a shell script named `hello.sh` that prints "hello" without asking for any confirmation. Based on the history:
-
-1. The content of `hello.sh` was read and confirmed to contain:
-   ```bash
-   #!/bin/bash
-   echo 'hello'
-   ```
-   This script correctly prints "hello" when executed.
-
-2. The script was executed using the command `bash hello.sh`, and the output was "hello", which matches the expected output.
-
-Since the script behaves as expected and meets the requirements of the task, the task is completed successfully.
-
 ```json
 {
-  "action": "finish",
+  "action": "run",
   "args": {
-    "outputs": {
-      "completed": true
-    }
+    "command": "bash hello.sh"
   }
 }
 ```
diff --git a/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_009.log b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_009.log
new file mode 100644
index 0000000000..fdeae4780c
--- /dev/null
+++ b/tests/integration/mock/DelegatorAgent/test_write_simple_script/response_009.log
@@ -0,0 +1,10 @@
+```json
+{
+  "action": "finish",
+  "args": {
+    "outputs": {
+      "completed": true
+    }
+  }
+}
+```
diff --git a/tests/integration/mock/ManagerAgent/test_edits/prompt_001.log b/tests/integration/mock/ManagerAgent/test_edits/prompt_001.log
index 4774a2b556..2839256104 100644
--- a/tests/integration/mock/ManagerAgent/test_edits/prompt_001.log
+++ b/tests/integration/mock/ManagerAgent/test_edits/prompt_001.log
@@ -70,7 +70,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "user", "action": "message", "args": {"content": "Fix typos in bad.txt. Do not ask me for confirmation at any point.", "wait_for_response": false}}, {"observation": "null", "content": "", "extras": {}}]]
+[{"source": "user", "action": "message", "args": {"content": "Fix typos in bad.txt. Do not ask me for confirmation at any point.", "wait_for_response": false}}]
 
 If the last item in the history is an error, you should try to fix it. If you
 cannot fix it, call the `reject` action.
diff --git a/tests/integration/mock/ManagerAgent/test_edits/prompt_003.log b/tests/integration/mock/ManagerAgent/test_edits/prompt_003.log
index 3774225c1c..0c53fc0f68 100644
--- a/tests/integration/mock/ManagerAgent/test_edits/prompt_003.log
+++ b/tests/integration/mock/ManagerAgent/test_edits/prompt_003.log
@@ -51,7 +51,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "read", "args": {"path": "./bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "./bad.txt"}}]]
+[{"source": "agent", "action": "read", "args": {"path": "./bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "./bad.txt"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/ManagerAgent/test_edits/prompt_004.log b/tests/integration/mock/ManagerAgent/test_edits/prompt_004.log
index 40dc39d1e6..10bee3d52c 100644
--- a/tests/integration/mock/ManagerAgent/test_edits/prompt_004.log
+++ b/tests/integration/mock/ManagerAgent/test_edits/prompt_004.log
@@ -51,7 +51,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "read", "args": {"path": "./bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "./bad.txt"}}], [{"source": "agent", "action": "message", "args": {"content": "I have identified the following typos in the file 'bad.txt':\n\n1. 'typoo' should be 'typo'\n2. 'mor' should be 'more'\n\nI will now proceed to fix these typos and overwrite the file with the corrected content.", "wait_for_response": false}}, {"observation": "null", "content": "", "extras": {}}]]
+[{"source": "agent", "action": "read", "args": {"path": "./bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "./bad.txt"}}, {"source": "agent", "action": "message", "args": {"content": "The content of 'bad.txt' has been read. The next step is to identify and fix the typos. Here are the identified typos:\n\n1. 'typoo' should be 'typo'\n2. 'mor' should be 'more'\n\nI will now proceed to fix these typos and overwrite the original file with the corrected content.", "wait_for_response": false}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/ManagerAgent/test_edits/prompt_005.log b/tests/integration/mock/ManagerAgent/test_edits/prompt_005.log
index 4506c17217..d6087dbc3a 100644
--- a/tests/integration/mock/ManagerAgent/test_edits/prompt_005.log
+++ b/tests/integration/mock/ManagerAgent/test_edits/prompt_005.log
@@ -51,7 +51,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "read", "args": {"path": "./bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "./bad.txt"}}], [{"source": "agent", "action": "message", "args": {"content": "I have identified the following typos in the file 'bad.txt':\n\n1. 'typoo' should be 'typo'\n2. 'mor' should be 'more'\n\nI will now proceed to fix these typos and overwrite the file with the corrected content.", "wait_for_response": false}}, {"observation": "null", "content": "", "extras": {}}], [{"source": "agent", "action": "write", "args": {"path": "./bad.txt", "content": "This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!\n", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "./bad.txt"}}]]
+[{"source": "agent", "action": "read", "args": {"path": "./bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "./bad.txt"}}, {"source": "agent", "action": "message", "args": {"content": "The content of 'bad.txt' has been read. The next step is to identify and fix the typos. Here are the identified typos:\n\n1. 'typoo' should be 'typo'\n2. 'mor' should be 'more'\n\nI will now proceed to fix these typos and overwrite the original file with the corrected content.", "wait_for_response": false}}, {"source": "agent", "action": "write", "args": {"path": "./bad.txt", "content": "This is a stupid typo.\nReally?\nNo more typos!\nEnjoy!\n", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "./bad.txt"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
diff --git a/tests/integration/mock/ManagerAgent/test_edits/prompt_006.log b/tests/integration/mock/ManagerAgent/test_edits/prompt_006.log
index 1b441a575c..9fb0e100db 100644
--- a/tests/integration/mock/ManagerAgent/test_edits/prompt_006.log
+++ b/tests/integration/mock/ManagerAgent/test_edits/prompt_006.log
@@ -70,7 +70,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "user", "action": "message", "args": {"content": "Fix typos in bad.txt. Do not ask me for confirmation at any point.", "wait_for_response": false}}, {"observation": "null", "content": "", "extras": {}}], [{"source": "agent", "action": "delegate", "args": {"agent": "TypoFixerAgent", "inputs": {"task": "Fix typos in bad.txt"}, "thought": ""}}, {"observation": "null", "content": "", "extras": {}}], [{"action": "null", "args": {}}, {"source": "agent", "observation": "delegate", "content": "TypoFixerAgent finishes task with summary: {'file': 'bad.txt', 'typos_fixed': [{'original': 'typoo', 'corrected': 'typo'}, {'original': 'mor', 'corrected': 'more'}]}", "extras": {"outputs": {"summary": {"file": "bad.txt", "typos_fixed": [{"original": "typoo", "corrected": "typo"}, {"original": "mor", "corrected": "more"}]}}}}]]
+[{"source": "user", "action": "message", "args": {"content": "Fix typos in bad.txt. Do not ask me for confirmation at any point.", "wait_for_response": false}}, {"source": "agent", "action": "delegate", "args": {"agent": "TypoFixerAgent", "inputs": {"task": "Fix typos in bad.txt"}, "thought": ""}}, {"source": "agent", "observation": "delegate", "content": "TypoFixerAgent finishes task with summary: {'file': './bad.txt', 'typos_fixed': [{'original': 'typoo', 'corrected': 'typo'}, {'original': 'mor', 'corrected': 'more'}]}", "extras": {"outputs": {"summary": {"file": "./bad.txt", "typos_fixed": [{"original": "typoo", "corrected": "typo"}, {"original": "mor", "corrected": "more"}]}}}}]
 
 If the last item in the history is an error, you should try to fix it. If you
 cannot fix it, call the `reject` action.
diff --git a/tests/integration/mock/ManagerAgent/test_edits/response_003.log b/tests/integration/mock/ManagerAgent/test_edits/response_003.log
index 05e136d650..cd2af0f8d2 100644
--- a/tests/integration/mock/ManagerAgent/test_edits/response_003.log
+++ b/tests/integration/mock/ManagerAgent/test_edits/response_003.log
@@ -1,6 +1,6 @@
 {
   "action": "message",
   "args": {
-    "content": "I have identified the following typos in the file 'bad.txt':\n\n1. 'typoo' should be 'typo'\n2. 'mor' should be 'more'\n\nI will now proceed to fix these typos and overwrite the file with the corrected content."
+    "content": "The content of 'bad.txt' has been read. The next step is to identify and fix the typos. Here are the identified typos:\n\n1. 'typoo' should be 'typo'\n2. 'mor' should be 'more'\n\nI will now proceed to fix these typos and overwrite the original file with the corrected content."
   }
 }
diff --git a/tests/integration/mock/ManagerAgent/test_edits/response_005.log b/tests/integration/mock/ManagerAgent/test_edits/response_005.log
index b414957c52..e762d4425e 100644
--- a/tests/integration/mock/ManagerAgent/test_edits/response_005.log
+++ b/tests/integration/mock/ManagerAgent/test_edits/response_005.log
@@ -3,7 +3,7 @@
   "args": {
     "outputs": {
       "summary": {
-        "file": "bad.txt",
+        "file": "./bad.txt",
         "typos_fixed": [
           {
             "original": "typoo",
diff --git a/tests/integration/mock/ManagerAgent/test_edits/response_006.log b/tests/integration/mock/ManagerAgent/test_edits/response_006.log
index 4afbff8867..ef2bab7241 100644
--- a/tests/integration/mock/ManagerAgent/test_edits/response_006.log
+++ b/tests/integration/mock/ManagerAgent/test_edits/response_006.log
@@ -3,7 +3,7 @@
   "action": "finish",
   "args": {
     "outputs": {
-      "file": "bad.txt",
+      "file": "./bad.txt",
       "typos_fixed": [
         {
           "original": "typoo",
diff --git a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_001.log b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_001.log
index 8eef0956c8..8ad7e8de7c 100644
--- a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_001.log
+++ b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_001.log
@@ -70,7 +70,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "user", "action": "message", "args": {"content": "Write a git commit message for the current staging area. Do not ask me for confirmation at any point.", "wait_for_response": false}}, {"observation": "null", "content": "", "extras": {}}]]
+[{"source": "user", "action": "message", "args": {"content": "Write a git commit message for the current staging area. Do not ask me for confirmation at any point.", "wait_for_response": false}}]
 
 If the last item in the history is an error, you should try to fix it. If you
 cannot fix it, call the `reject` action.
diff --git a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_003.log b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_003.log
index 85f2d2daab..d53f4a40ec 100644
--- a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_003.log
+++ b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_003.log
@@ -28,7 +28,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": ""}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}]]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": ""}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}]
 
 If the last item in the history is an error, you should try to fix it.
 
diff --git a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_004.log b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_004.log
index 7e953c3fbe..6142847231 100644
--- a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_004.log
+++ b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_004.log
@@ -28,7 +28,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": ""}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}], [{"action": "null", "args": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": ""}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 If the last item in the history is an error, you should try to fix it.
 
diff --git a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_005.log b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_005.log
index 357bcd69f3..e66ff74908 100644
--- a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_005.log
+++ b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_005.log
@@ -28,7 +28,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": ""}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}], [{"action": "null", "args": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}], [{"action": "null", "args": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": ""}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 If the last item in the history is an error, you should try to fix it.
 
diff --git a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_006.log b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_006.log
index e3106b973c..0356a4b906 100644
--- a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_006.log
+++ b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/prompt_006.log
@@ -70,7 +70,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[[{"source": "user", "action": "message", "args": {"content": "Write a git commit message for the current staging area. Do not ask me for confirmation at any point.", "wait_for_response": false}}, {"observation": "null", "content": "", "extras": {}}], [{"source": "agent", "action": "delegate", "args": {"agent": "CommitWriterAgent", "inputs": {}, "thought": ""}}, {"observation": "null", "content": "", "extras": {}}], [{"action": "null", "args": {}}, {"source": "agent", "observation": "delegate", "content": "CommitWriterAgent finishes task with reason: Not a valid git repository.", "extras": {"outputs": {"reason": "Not a valid git repository."}}}]]
+[{"source": "user", "action": "message", "args": {"content": "Write a git commit message for the current staging area. Do not ask me for confirmation at any point.", "wait_for_response": false}}, {"source": "agent", "action": "delegate", "args": {"agent": "CommitWriterAgent", "inputs": {}, "thought": ""}}, {"source": "agent", "observation": "delegate", "content": "CommitWriterAgent finishes task with reason: Not a valid git repository.", "extras": {"outputs": {"reason": "Not a valid git repository."}}}]
 
 If the last item in the history is an error, you should try to fix it. If you
 cannot fix it, call the `reject` action.
diff --git a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/response_003.log b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/response_003.log
index 55e1a90ead..284ec6eaf9 100644
--- a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/response_003.log
+++ b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/response_003.log
@@ -1,6 +1,8 @@
+```json
 {
   "action": "reject",
   "args": {
     "reason": "Not a valid git repository."
   }
 }
+```
diff --git a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/response_004.log b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/response_004.log
index 284ec6eaf9..55e1a90ead 100644
--- a/tests/integration/mock/ManagerAgent/test_simple_task_rejection/response_004.log
+++ b/tests/integration/mock/ManagerAgent/test_simple_task_rejection/response_004.log
@@ -1,8 +1,6 @@
-```json
 {
   "action": "reject",
   "args": {
     "reason": "Not a valid git repository."
   }
 }
-```
diff --git a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_011.log b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_011.log
index 6d201a109f..92fa4bd751 100644
--- a/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_011.log
+++ b/tests/integration/mock/PlannerAgent/test_write_simple_script/prompt_011.log
@@ -79,6 +79,14 @@ as well as observations you've made. This only includes the MOST RECENT
 ten actions--more happened before that.
 
 [
+  {
+    "source": "user",
+    "action": "message",
+    "args": {
+      "content": "Write a shell script 'hello.sh' that prints 'hello'. Do not ask me for confirmation at any point.",
+      "wait_for_response": false
+    }
+  },
   {
     "source": "agent",
     "action": "add_task",
diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py
index b2ef740cc4..6a5b95c195 100644
--- a/tests/integration/test_agent.py
+++ b/tests/integration/test_agent.py
@@ -14,6 +14,8 @@ from opendevin.events.action import (
     AgentFinishAction,
     AgentRejectAction,
 )
+from opendevin.events.observation.browse import BrowserOutputObservation
+from opendevin.events.observation.delegate import AgentDelegateObservation
 from opendevin.llm.llm import LLM
 
 workspace_base = os.getenv('WORKSPACE_BASE')
@@ -169,7 +171,7 @@ def test_simple_task_rejection():
     final_state: State | None = asyncio.run(run_agent_controller(agent, task))
     assert final_state.agent_state == AgentState.STOPPED
     assert final_state.last_error is None
-    assert isinstance(final_state.history[-1][0], AgentRejectAction)
+    assert isinstance(final_state.history.get_last_action(), AgentRejectAction)
 
 
 @pytest.mark.skipif(
@@ -229,5 +231,17 @@ def test_browse_internet(http_server):
     )
     assert final_state.agent_state == AgentState.STOPPED
     assert final_state.last_error is None
-    assert isinstance(final_state.history[-1][0], AgentFinishAction)
-    assert 'OpenDevin is all you need!' in str(final_state.history)
+
+    # last action
+    last_action = final_state.history.get_last_action()
+    assert isinstance(last_action, AgentFinishAction)
+
+    # last observation
+    last_observation = final_state.history.get_last_observation()
+    assert isinstance(
+        last_observation, (BrowserOutputObservation, AgentDelegateObservation)
+    )
+    if isinstance(last_observation, BrowserOutputObservation):
+        assert 'OpenDevin is all you need!' in last_observation.content
+    elif isinstance(last_observation, AgentDelegateObservation):
+        assert 'OpenDevin is all you need!' in last_observation.outputs['content']
diff --git a/tests/unit/test_event_stream.py b/tests/unit/test_event_stream.py
index 4612ce7772..b6519a38a8 100644
--- a/tests/unit/test_event_stream.py
+++ b/tests/unit/test_event_stream.py
@@ -1,25 +1,34 @@
 import json
 
+import pytest
+
 from opendevin.events import EventSource, EventStream
 from opendevin.events.action import NullAction
 from opendevin.events.observation import NullObservation
 
 
+@pytest.fixture
+def event_stream():
+    event_stream = EventStream('abc')
+    yield event_stream
+
+    # clear after each test
+    event_stream.clear()
+
+
 def collect_events(stream):
     return [event for event in stream.get_events()]
 
 
-def test_basic_flow():
-    stream = EventStream('abc')
-    stream.add_event(NullAction(), EventSource.AGENT)
-    assert len(collect_events(stream)) == 1
+def test_basic_flow(event_stream: EventStream):
+    event_stream.add_event(NullAction(), EventSource.AGENT)
+    assert len(collect_events(event_stream)) == 1
 
 
-def test_stream_storage():
-    stream = EventStream('def')
-    stream.add_event(NullObservation(''), EventSource.AGENT)
-    assert len(collect_events(stream)) == 1
-    content = stream._file_store.read('sessions/def/events/0.json')
+def test_stream_storage(event_stream: EventStream):
+    event_stream.add_event(NullObservation(''), EventSource.AGENT)
+    assert len(collect_events(event_stream)) == 1
+    content = event_stream._file_store.read('sessions/abc/events/0.json')
     assert content is not None
     data = json.loads(content)
     assert 'timestamp' in data
@@ -34,16 +43,15 @@ def test_stream_storage():
     }
 
 
-def test_rehydration():
-    stream1 = EventStream('es1')
-    stream1.add_event(NullObservation('obs1'), EventSource.AGENT)
-    stream1.add_event(NullObservation('obs2'), EventSource.AGENT)
-    assert len(collect_events(stream1)) == 2
+def test_rehydration(event_stream: EventStream):
+    event_stream.add_event(NullObservation('obs1'), EventSource.AGENT)
+    event_stream.add_event(NullObservation('obs2'), EventSource.AGENT)
+    assert len(collect_events(event_stream)) == 2
 
     stream2 = EventStream('es2')
     assert len(collect_events(stream2)) == 0
 
-    stream1rehydrated = EventStream('es1')
+    stream1rehydrated = EventStream('abc')
     events = collect_events(stream1rehydrated)
     assert len(events) == 2
     assert events[0].content == 'obs1'
diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py
index 8d9844c660..e160d9fb11 100644
--- a/tests/unit/test_is_stuck.py
+++ b/tests/unit/test_is_stuck.py
@@ -1,17 +1,501 @@
+import logging
 from unittest.mock import Mock, patch
 
 import pytest
 
 from opendevin.controller.agent_controller import AgentController
+from opendevin.controller.state.state import State
+from opendevin.controller.stuck import StuckDetector
 from opendevin.events.action import CmdRunAction, FileReadAction, MessageAction
+from opendevin.events.action.commands import IPythonRunCellAction
 from opendevin.events.observation import (
     CmdOutputObservation,
     FileReadObservation,
-    Observation,
 )
+from opendevin.events.observation.commands import IPythonRunCellObservation
 from opendevin.events.observation.empty import NullObservation
 from opendevin.events.observation.error import ErrorObservation
-from opendevin.events.stream import EventSource
+from opendevin.events.stream import EventSource, EventStream
+from opendevin.memory.history import ShortTermHistory
+
+
+def collect_events(stream):
+    return [event for event in stream.get_events()]
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+@pytest.fixture
+def event_stream():
+    event_stream = EventStream('asdf')
+    yield event_stream
+
+    # clear after each test
+    event_stream.clear()
+
+
+class TestStuckDetector:
+    @pytest.fixture
+    def stuck_detector(self, event_stream):
+        state = State(inputs={}, max_iterations=50)
+        # State() already carries a history object; only the stream is attached
+        state.history.set_event_stream(event_stream)
+
+        return StuckDetector(state)
+
+    def test_history_too_short(
+        self, stuck_detector: StuckDetector, event_stream: EventStream
+    ):
+        message_action = MessageAction(content='Hello', wait_for_response=False)
+        message_action._source = EventSource.USER
+        observation = NullObservation(content='')
+        observation._cause = message_action.id
+        event_stream.add_event(message_action, EventSource.USER)
+        event_stream.add_event(observation, EventSource.USER)
+
+        cmd_action = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action, EventSource.AGENT)
+        cmd_observation = CmdOutputObservation(
+            command_id=1, command='ls', content='file1.txt\nfile2.txt'
+        )
+        cmd_observation._cause = cmd_action._id
+        event_stream.add_event(cmd_observation, EventSource.USER)
+
+        # the stuck_detector fixture already attached event_stream to the history
+
+        assert stuck_detector.is_stuck() is False
+
+    def test_is_stuck_repeating_action_observation(
+        self, stuck_detector: StuckDetector, event_stream: EventStream
+    ):
+        message_action = MessageAction(content='Done', wait_for_response=False)
+        message_action._source = EventSource.USER
+
+        hello_action = MessageAction(content='Hello', wait_for_response=False)
+        hello_observation = NullObservation('')
+
+        # 2 events
+        event_stream.add_event(hello_action, EventSource.USER)
+        event_stream.add_event(hello_observation, EventSource.USER)
+
+        cmd_action_1 = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        cmd_observation_1 = CmdOutputObservation(
+            content='', command='ls', command_id=cmd_action_1._id
+        )
+        cmd_observation_1._cause = cmd_action_1._id
+        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        # 4 events
+
+        cmd_action_2 = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        cmd_observation_2 = CmdOutputObservation(
+            content='', command='ls', command_id=cmd_action_2._id
+        )
+        cmd_observation_2._cause = cmd_action_2._id
+        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        # 6 events
+
+        # interleaved user message; it must not break the loop detection below
+        message_null_observation = NullObservation(content='')
+        event_stream.add_event(message_action, EventSource.USER)
+        event_stream.add_event(message_null_observation, EventSource.USER)
+        # 8 events
+
+        assert stuck_detector.is_stuck() is False
+        assert stuck_detector.state.almost_stuck == 2
+
+        cmd_action_3 = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        cmd_observation_3 = CmdOutputObservation(
+            content='', command='ls', command_id=cmd_action_3._id
+        )
+        cmd_observation_3._cause = cmd_action_3._id
+        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        # 10 events
+
+        assert len(collect_events(event_stream)) == 10
+        assert len(list(stuck_detector.state.history.get_events())) == 8
+        assert len(stuck_detector.state.history.get_pairs()) == 5
+
+        assert stuck_detector.is_stuck() is False
+        assert stuck_detector.state.almost_stuck == 1
+
+        cmd_action_4 = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action_4, EventSource.AGENT)
+        cmd_observation_4 = CmdOutputObservation(
+            content='', command='ls', command_id=cmd_action_4._id
+        )
+        cmd_observation_4._cause = cmd_action_4._id
+        event_stream.add_event(cmd_observation_4, EventSource.USER)
+        # 12 events
+
+        assert len(collect_events(event_stream)) == 12
+        assert len(list(stuck_detector.state.history.get_events())) == 10
+        assert len(stuck_detector.state.history.get_pairs()) == 6
+
+        with patch('logging.Logger.warning') as mock_warning:
+            assert stuck_detector.is_stuck() is True
+            assert stuck_detector.state.almost_stuck == 0
+            mock_warning.assert_called_once_with('Action, Observation loop detected')
+
+    def test_is_stuck_repeating_action_error(
+        self, stuck_detector: StuckDetector, event_stream: EventStream
+    ):
+        # (action, error_observation), not necessarily the same error
+        message_action = MessageAction(content='Done', wait_for_response=False)
+        message_action._source = EventSource.USER
+
+        hello_action = MessageAction(content='Hello', wait_for_response=False)
+        hello_observation = NullObservation(content='')
+        event_stream.add_event(hello_action, EventSource.USER)
+        hello_observation._cause = hello_action._id
+        event_stream.add_event(hello_observation, EventSource.USER)
+        # 2 events
+
+        cmd_action_1 = CmdRunAction(command='invalid_command')
+        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        error_observation_1 = ErrorObservation(content='Command not found')
+        error_observation_1._cause = cmd_action_1._id
+        event_stream.add_event(error_observation_1, EventSource.USER)
+        # 4 events
+
+        cmd_action_2 = CmdRunAction(command='invalid_command')
+        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        error_observation_2 = ErrorObservation(
+            content='Command still not found or another error'
+        )
+        error_observation_2._cause = cmd_action_2._id
+        event_stream.add_event(error_observation_2, EventSource.USER)
+        # 6 events
+
+        message_null_observation = NullObservation(content='')
+        event_stream.add_event(message_action, EventSource.USER)
+        event_stream.add_event(message_null_observation, EventSource.USER)
+        # 8 events
+
+        cmd_action_3 = CmdRunAction(command='invalid_command')
+        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        error_observation_3 = ErrorObservation(content='Different error')
+        error_observation_3._cause = cmd_action_3._id
+        event_stream.add_event(error_observation_3, EventSource.USER)
+        # 10 events
+
+        cmd_action_4 = CmdRunAction(command='invalid_command')
+        event_stream.add_event(cmd_action_4, EventSource.AGENT)
+        error_observation_4 = ErrorObservation(content='Command not found')
+        error_observation_4._cause = cmd_action_4._id
+        event_stream.add_event(error_observation_4, EventSource.USER)
+        # 12 events
+
+        with patch('logging.Logger.warning') as mock_warning:
+            assert stuck_detector.is_stuck() is True
+            mock_warning.assert_called_once_with(
+                'Action, ErrorObservation loop detected'
+            )
+
+    def test_is_stuck_ipython_syntax_error(
+        self, stuck_detector: StuckDetector, event_stream: EventStream
+    ):
+        ipython_action_1 = IPythonRunCellAction(code='print("hello')
+        event_stream.add_event(ipython_action_1, EventSource.AGENT)
+        ipython_observation_1 = IPythonRunCellObservation(
+            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)',
+            code='print("hello',
+        )
+        ipython_observation_1._cause = ipython_action_1._id
+        event_stream.add_event(ipython_observation_1, EventSource.USER)
+
+        ipython_action_2 = IPythonRunCellAction(code='print("hello')
+        event_stream.add_event(ipython_action_2, EventSource.AGENT)
+        ipython_observation_2 = IPythonRunCellObservation(
+            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)',
+            code='print("hello',
+        )
+        ipython_observation_2._cause = ipython_action_2._id
+        event_stream.add_event(ipython_observation_2, EventSource.USER)
+
+        ipython_action_3 = IPythonRunCellAction(code='print("hello')
+        event_stream.add_event(ipython_action_3, EventSource.AGENT)
+        ipython_observation_3 = IPythonRunCellObservation(
+            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 3)',
+            code='print("hello',
+        )
+        ipython_observation_3._cause = ipython_action_3._id
+        event_stream.add_event(ipython_observation_3, EventSource.USER)
+
+        ipython_action_4 = IPythonRunCellAction(code='print("hello')
+        event_stream.add_event(ipython_action_4, EventSource.AGENT)
+        ipython_observation_4 = IPythonRunCellObservation(
+            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 2)',
+            code='print("hello',
+        )
+        ipython_observation_4._cause = ipython_action_4._id
+        event_stream.add_event(ipython_observation_4, EventSource.USER)
+
+        # the stuck_detector fixture already attached event_stream to the history
+
+        last_observations = [
+            ipython_observation_1,
+            ipython_observation_2,
+            ipython_observation_3,
+            ipython_observation_4,
+        ]
+        for observation in last_observations:
+            has_string = (
+                observation.content[-100:].find(
+                    'SyntaxError: unterminated string literal (detected at line'
+                )
+                != -1
+            )
+            assert has_string
+
+            string_is_last = (
+                len(
+                    observation.content.split(
+                        'SyntaxError: unterminated string literal (detected at line'
+                    )[-1]
+                )
+                < 10
+            )
+            assert string_is_last
+
+        with patch('logging.Logger.warning') as mock_warning:
+            assert stuck_detector.is_stuck() is True
+            mock_warning.assert_called_once_with(
+                'Action, IPythonRunCellObservation loop detected'
+            )
+
+    def test_is_stuck_ipython_syntax_error_not_at_end(
+        self, stuck_detector: StuckDetector, event_stream: EventStream
+    ):
+        ipython_action_1 = IPythonRunCellAction(code='print("hello')
+        event_stream.add_event(ipython_action_1, EventSource.AGENT)
+        ipython_observation_1 = IPythonRunCellObservation(
+            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)\nThis is some additional output',
+            code='print("hello',
+        )
+        ipython_observation_1._cause = ipython_action_1._id
+        event_stream.add_event(ipython_observation_1, EventSource.USER)
+
+        ipython_action_2 = IPythonRunCellAction(code='print("hello')
+        event_stream.add_event(ipython_action_2, EventSource.AGENT)
+        ipython_observation_2 = IPythonRunCellObservation(
+            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)\nToo much output here on and on',
+            code='print("hello',
+        )
+        ipython_observation_2._cause = ipython_action_2._id
+        event_stream.add_event(ipython_observation_2, EventSource.USER)
+
+        ipython_action_3 = IPythonRunCellAction(code='print("hello')
+        event_stream.add_event(ipython_action_3, EventSource.AGENT)
+        ipython_observation_3 = IPythonRunCellObservation(
+            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 3)\nEnough',
+            code='print("hello',
+        )
+        ipython_observation_3._cause = ipython_action_3._id
+        event_stream.add_event(ipython_observation_3, EventSource.USER)
+
+        ipython_action_4 = IPythonRunCellAction(code='print("hello')
+        event_stream.add_event(ipython_action_4, EventSource.AGENT)
+        ipython_observation_4 = IPythonRunCellObservation(
+            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 2)\nLast line of output',
+            code='print("hello',
+        )
+        ipython_observation_4._cause = ipython_action_4._id
+        event_stream.add_event(ipython_observation_4, EventSource.USER)
+
+        with patch('logging.Logger.warning') as mock_warning:
+            assert stuck_detector.is_stuck() is False
+            mock_warning.assert_not_called()
+
+    def test_is_stuck_repeating_action_observation_pattern(
+        self, stuck_detector: StuckDetector, event_stream: EventStream
+    ):
+        message_action = MessageAction(content='Come on', wait_for_response=False)
+        message_action._source = EventSource.USER
+        event_stream.add_event(message_action, EventSource.USER)
+        message_observation = NullObservation(content='')
+        event_stream.add_event(message_observation, EventSource.USER)
+
+        cmd_action_1 = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        cmd_observation_1 = CmdOutputObservation(
+            command_id=1, command='ls', content='file1.txt\nfile2.txt'
+        )
+        cmd_observation_1._cause = cmd_action_1._id
+        event_stream.add_event(cmd_observation_1, EventSource.USER)
+
+        read_action_1 = FileReadAction(path='file1.txt')
+        event_stream.add_event(read_action_1, EventSource.AGENT)
+        read_observation_1 = FileReadObservation(
+            content='File content', path='file1.txt'
+        )
+        read_observation_1._cause = read_action_1._id
+        event_stream.add_event(read_observation_1, EventSource.USER)
+
+        cmd_action_2 = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        cmd_observation_2 = CmdOutputObservation(
+            command_id=2, command='ls', content='file1.txt\nfile2.txt'
+        )
+        cmd_observation_2._cause = cmd_action_2._id
+        event_stream.add_event(cmd_observation_2, EventSource.USER)
+
+        read_action_2 = FileReadAction(path='file1.txt')
+        event_stream.add_event(read_action_2, EventSource.AGENT)
+        read_observation_2 = FileReadObservation(
+            content='File content', path='file1.txt'
+        )
+        read_observation_2._cause = read_action_2._id
+        event_stream.add_event(read_observation_2, EventSource.USER)
+
+        # interleave one more user message; the pattern must still be detected
+        message_null_observation = NullObservation(content='')
+        event_stream.add_event(message_action, EventSource.USER)
+        event_stream.add_event(message_null_observation, EventSource.USER)
+
+        cmd_action_3 = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        cmd_observation_3 = CmdOutputObservation(
+            command_id=3, command='ls', content='file1.txt\nfile2.txt'
+        )
+        cmd_observation_3._cause = cmd_action_3._id
+        event_stream.add_event(cmd_observation_3, EventSource.USER)
+
+        read_action_3 = FileReadAction(path='file1.txt')
+        event_stream.add_event(read_action_3, EventSource.AGENT)
+        read_observation_3 = FileReadObservation(
+            content='File content', path='file1.txt'
+        )
+        read_observation_3._cause = read_action_3._id
+        event_stream.add_event(read_observation_3, EventSource.USER)
+
+        with patch('logging.Logger.warning') as mock_warning:
+            assert stuck_detector.is_stuck() is True
+            mock_warning.assert_called_once_with('Action, Observation pattern detected')
+
+    def test_is_stuck_not_stuck(
+        self, stuck_detector: StuckDetector, event_stream: EventStream
+    ):
+        message_action = MessageAction(content='Done', wait_for_response=False)
+        message_action._source = EventSource.USER
+
+        hello_action = MessageAction(content='Hello', wait_for_response=False)
+        event_stream.add_event(hello_action, EventSource.USER)
+        hello_observation = NullObservation(content='')
+        hello_observation._cause = hello_action._id
+        event_stream.add_event(hello_observation, EventSource.USER)
+
+        cmd_action_1 = CmdRunAction(command='ls')
+        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        cmd_observation_1 = CmdOutputObservation(
+            command_id=cmd_action_1.id, command='ls', content='file1.txt\nfile2.txt'
+        )
+        cmd_observation_1._cause = cmd_action_1._id
+        event_stream.add_event(cmd_observation_1, EventSource.USER)
+
+        read_action_1 = FileReadAction(path='file1.txt')
+        event_stream.add_event(read_action_1, EventSource.AGENT)
+        read_observation_1 = FileReadObservation(
+            content='File content', path='file1.txt'
+        )
+        read_observation_1._cause = read_action_1._id
+        event_stream.add_event(read_observation_1, EventSource.USER)
+
+        cmd_action_2 = CmdRunAction(command='pwd')
+        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        cmd_observation_2 = CmdOutputObservation(
+            command_id=2, command='pwd', content='/home/user'
+        )
+        cmd_observation_2._cause = cmd_action_2._id
+        event_stream.add_event(cmd_observation_2, EventSource.USER)
+
+        read_action_2 = FileReadAction(path='file2.txt')
+        event_stream.add_event(read_action_2, EventSource.AGENT)
+        read_observation_2 = FileReadObservation(
+            content='Another file content', path='file2.txt'
+        )
+        read_observation_2._cause = read_action_2._id
+        event_stream.add_event(read_observation_2, EventSource.USER)
+
+        message_null_observation = NullObservation(content='')
+        event_stream.add_event(message_action, EventSource.USER)
+        event_stream.add_event(message_null_observation, EventSource.USER)
+
+        cmd_action_3 = CmdRunAction(command='pwd')
+        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        cmd_observation_3 = CmdOutputObservation(
+            command_id=cmd_action_3.id, command='pwd', content='/home/user'
+        )
+        cmd_observation_3._cause = cmd_action_3._id
+        event_stream.add_event(cmd_observation_3, EventSource.USER)
+
+        read_action_3 = FileReadAction(path='file2.txt')
+        event_stream.add_event(read_action_3, EventSource.AGENT)
+        read_observation_3 = FileReadObservation(
+            content='Another file content', path='file2.txt'
+        )
+        read_observation_3._cause = read_action_3._id
+        event_stream.add_event(read_observation_3, EventSource.USER)
+
+        assert stuck_detector.is_stuck() is False
+
+    def test_is_stuck_monologue(self, stuck_detector, event_stream):
+        # Add events to the event stream
+        message_action_1 = MessageAction(content='Hi there!')
+        event_stream.add_event(message_action_1, EventSource.USER)
+        message_action_1._source = EventSource.USER
+
+        message_action_2 = MessageAction(content='Hi there!')
+        event_stream.add_event(message_action_2, EventSource.AGENT)
+        message_action_2._source = EventSource.AGENT
+
+        message_action_3 = MessageAction(content='How are you?')
+        event_stream.add_event(message_action_3, EventSource.USER)
+        message_action_3._source = EventSource.USER
+
+        cmd_kill_action = CmdRunAction(
+            command='echo 42', thought="I'm not stuck, he's stuck"
+        )
+        event_stream.add_event(cmd_kill_action, EventSource.AGENT)
+
+        message_action_4 = MessageAction(content="I'm doing well, thanks for asking.")
+        event_stream.add_event(message_action_4, EventSource.AGENT)
+        message_action_4._source = EventSource.AGENT
+
+        message_action_5 = MessageAction(content="I'm doing well, thanks for asking.")
+        event_stream.add_event(message_action_5, EventSource.AGENT)
+        message_action_5._source = EventSource.AGENT
+
+        message_action_6 = MessageAction(content="I'm doing well, thanks for asking.")
+        event_stream.add_event(message_action_6, EventSource.AGENT)
+        message_action_6._source = EventSource.AGENT
+
+        # the stuck_detector fixture already attached event_stream to the history
+
+        assert stuck_detector.is_stuck()
+
+        # Add an observation event between the repeated message actions
+        cmd_output_observation = CmdOutputObservation(
+            content='OK, I was stuck, but no more.',
+            command_id=42,
+            command='storybook',
+            exit_code=0,
+        )
+        cmd_output_observation._cause = cmd_kill_action._id
+        event_stream.add_event(cmd_output_observation, EventSource.USER)
+
+        message_action_7 = MessageAction(content="I'm doing well, thanks for asking.")
+        event_stream.add_event(message_action_7, EventSource.AGENT)
+        message_action_7._source = EventSource.AGENT
+
+        message_action_8 = MessageAction(content="I'm doing well, thanks for asking.")
+        event_stream.add_event(message_action_8, EventSource.AGENT)
+        message_action_8._source = EventSource.AGENT
+
+        assert not stuck_detector.is_stuck()
 
 
 class TestAgentController:
@@ -21,212 +505,12 @@ class TestAgentController:
         controller._is_stuck = AgentController._is_stuck.__get__(
             controller, AgentController
         )
-        controller._eq_no_pid = AgentController._eq_no_pid.__get__(
-            controller, AgentController
-        )
         controller.delegate = None
         controller.state = Mock()
-        controller.state.history = []
+        controller.state.history = ShortTermHistory()
         return controller
 
-    def test_history_too_short(self, controller):
-        controller.state.history = [
-            (
-                MessageAction(content='Hello', wait_for_response=False),
-                Observation(content='Response 1'),
-            ),
-            (
-                CmdRunAction(command='ls'),
-                CmdOutputObservation(
-                    command_id=1, command='ls', content='file1.txt\nfile2.txt'
-                ),
-            ),
-        ]
-        assert controller._is_stuck() is False
-
-    def test_is_stuck_repeating_action_null_observation(self, controller):
-        # message actions with source USER are not considered in the stuck check
-        message_action = MessageAction(content='Done', wait_for_response=False)
-        message_action._source = EventSource.USER
-        controller.state.history = [
-            (
-                MessageAction(content='Hello', wait_for_response=False),
-                Observation(content='Response 1'),
-            ),
-            (CmdRunAction(command='ls'), NullObservation(content='')),
-            (CmdRunAction(command='ls'), NullObservation(content='')),
-            # user message shouldn't break detection
-            (message_action, NullObservation(content='')),
-            (CmdRunAction(command='ls'), NullObservation(content='')),
-            (CmdRunAction(command='ls'), NullObservation(content='')),
-        ]
-        with patch('logging.Logger.warning') as mock_warning:
-            assert controller._is_stuck() is True
-            mock_warning.assert_called_once_with('Action, Observation loop detected')
-
-    def test_is_stuck_repeating_action_error_observation(self, controller):
-        # (action, error_observation), not necessarily the same error
-        message_action = MessageAction(content='Done', wait_for_response=False)
-        message_action._source = EventSource.USER
-        controller.state.history = [
-            (
-                MessageAction(content='Hello', wait_for_response=False),
-                Observation(content='Response 1'),
-            ),
-            (
-                CmdRunAction(command='invalid_command'),
-                ErrorObservation(content='Command not found'),
-            ),
-            (
-                CmdRunAction(command='invalid_command'),
-                ErrorObservation(content='Command still not found or another error'),
-            ),
-            # user message shouldn't break detection
-            (message_action, NullObservation(content='')),
-            (
-                CmdRunAction(command='invalid_command'),
-                ErrorObservation(content='Different error'),
-            ),
-            (
-                CmdRunAction(command='invalid_command'),
-                ErrorObservation(content='Command not found'),
-            ),
-        ]
-        with patch('logging.Logger.warning') as mock_warning:
-            assert controller._is_stuck() is True
-            mock_warning.assert_called_once_with(
-                'Action, ErrorObservation loop detected'
-            )
-
-    def test_is_stuck_repeating_action_observation_pattern(self, controller):
-        # six tuples of action, observation
-        message_action = MessageAction(content='Come on', wait_for_response=False)
-        message_action._source = EventSource.USER
-        controller.state.history = [
-            (
-                message_action,
-                Observation(content=''),
-            ),
-            (
-                CmdRunAction(command='ls'),
-                CmdOutputObservation(
-                    command_id=1, command='ls', content='file1.txt\nfile2.txt'
-                ),
-            ),
-            (
-                FileReadAction(path='file1.txt'),
-                FileReadObservation(content='File content', path='file1.txt'),
-            ),
-            (
-                CmdRunAction(command='ls'),
-                # command_id is ignored for the eq check, it's a pid
-                CmdOutputObservation(
-                    command_id=2, command='ls', content='file1.txt\nfile2.txt'
-                ),
-            ),
-            (
-                FileReadAction(path='file1.txt'),
-                FileReadObservation(content='File content', path='file1.txt'),
-            ),
-            # insert a message just because they can, it shouldn't break detection
-            (message_action, NullObservation(content='')),
-            (
-                CmdRunAction(command='ls'),
-                CmdOutputObservation(
-                    command_id=3, command='ls', content='file1.txt\nfile2.txt'
-                ),
-            ),
-            (
-                FileReadAction(path='file1.txt'),
-                FileReadObservation(content='File content', path='file1.txt'),
-            ),
-        ]
-        with patch('logging.Logger.warning') as mock_warning:
-            assert controller._is_stuck() is True
-            mock_warning.assert_called_once_with('Action, Observation pattern detected')
-
-    def test_is_stuck_not_stuck(self, controller):
-        message_action = MessageAction(content='Done', wait_for_response=False)
-        message_action._source = EventSource.USER
-        controller.state.history = [
-            (
-                MessageAction(content='Hello', wait_for_response=False),
-                Observation(content='Response 1'),
-            ),
-            (
-                CmdRunAction(command='ls'),
-                CmdOutputObservation(
-                    command_id=1, command='ls', content='file1.txt\nfile2.txt'
-                ),
-            ),
-            (
-                FileReadAction(path='file1.txt'),
-                FileReadObservation(content='File content', path='file1.txt'),
-            ),
-            (
-                CmdRunAction(command='pwd'),
-                # command_id is ignored for the eq check, it's the pid
-                CmdOutputObservation(command_id=2, command='pwd', content='/home/user'),
-            ),
-            (
-                FileReadAction(path='file2.txt'),
-                Observation(content='Another file content'),
-            ),
-            # insert a message from the user
-            (message_action, NullObservation(content='')),
-            (
-                CmdRunAction(command='pwd'),
-                CmdOutputObservation(command_id=3, command='pwd', content='/home/user'),
-            ),
-            (
-                FileReadAction(path='file2.txt'),
-                Observation(content='Another file content'),
-            ),
-        ]
-        assert controller._is_stuck() is False
-
-    def test_is_stuck_three_identical_tuples(self, controller):
-        # the same (action, observation), three times
-
-        # prepare messages to interrupt things
-        message_action = MessageAction(content='Done', wait_for_response=False)
-        message_action._source = EventSource.USER
-        message_action2 = MessageAction(content='Or not done?', wait_for_response=False)
-        message_action2._source = EventSource.USER
-        controller.state.history = [
-            (
-                MessageAction(content='Hello', wait_for_response=False),
-                Observation(content='Response 1'),
-            ),
-            # message from the user shouldn't interfere with the detection
-            (message_action, NullObservation(content='')),
-            (
-                CmdRunAction(command='ls'),
-                CmdOutputObservation(
-                    command_id=1, command='ls', content='file1.txt\nfile2.txt'
-                ),
-            ),
-            (
-                CmdRunAction(command='ls'),
-                # command_id is ignored for the eq check, it's just the pid
-                CmdOutputObservation(
-                    command_id=2, command='ls', content='file1.txt\nfile2.txt'
-                ),
-            ),
-            # message from the user shouldn't interfere with the detection
-            (message_action2, NullObservation(content='')),
-            (
-                CmdRunAction(command='ls'),
-                CmdOutputObservation(
-                    command_id=3, command='ls', content='file1.txt\nfile2.txt'
-                ),
-            ),
-        ]
-        with patch('logging.Logger.warning') as mock_warning:
-            assert controller._is_stuck() is True
-            mock_warning.assert_called_once_with('Action, Observation loop detected')
-
-    def test_is_stuck_delegate_stuck(self, controller):
+    def test_is_stuck_delegate_stuck(self, controller: AgentController):
         controller.delegate = Mock()
         controller.delegate._is_stuck.return_value = True
         assert controller._is_stuck() is True
diff --git a/tests/unit/test_micro_agents.py b/tests/unit/test_micro_agents.py
index ea47abb48d..1f76ceccd3 100644
--- a/tests/unit/test_micro_agents.py
+++ b/tests/unit/test_micro_agents.py
@@ -2,6 +2,7 @@ import json
 import os
 from unittest.mock import MagicMock
 
+import pytest
 import yaml
 
 from agenthub.micro.registry import all_microagents
@@ -9,7 +10,17 @@ from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.events import EventSource
 from opendevin.events.action import MessageAction
-from opendevin.events.observation import NullObservation
+from opendevin.events.stream import EventStream
+from opendevin.memory.history import ShortTermHistory
+
+
+@pytest.fixture
+def event_stream():
+    """Yield an EventStream built with 'asdf'; clear() it once the test is done."""
+    event_stream = EventStream('asdf')
+    yield event_stream
+
+    event_stream.clear()  # teardown: pytest resumes here after the test finishes
 
 
 def test_all_agents_are_loaded():
@@ -29,7 +40,7 @@ def test_all_agents_are_loaded():
     assert agent_names == set(all_microagents.keys())
 
 
-def test_coder_agent_with_summary():
+def test_coder_agent_with_summary(event_stream: EventStream):
     """
     Coder agent should render code summary as part of prompt
     """
@@ -41,8 +52,10 @@ def test_coder_agent_with_summary():
     assert coder_agent is not None
 
     task = 'This is a dummy task'
-    history = [(MessageAction(content=task), NullObservation(''))]
-    history[0][0]._source = EventSource.USER
+    history = ShortTermHistory()
+    history.set_event_stream(event_stream)
+    event_stream.add_event(MessageAction(content=task), EventSource.USER)
+
     summary = 'This is a dummy summary about this repo'
     state = State(history=history, inputs={'summary': summary})
     coder_agent.step(state)
@@ -55,7 +68,7 @@ def test_coder_agent_with_summary():
     assert summary in prompt
 
 
-def test_coder_agent_without_summary():
+def test_coder_agent_without_summary(event_stream: EventStream):
     """
     When there's no codebase_summary available, there shouldn't be any prompt
     about 'code summary'
@@ -68,8 +81,11 @@ def test_coder_agent_without_summary():
     assert coder_agent is not None
 
     task = 'This is a dummy task'
-    history = [(MessageAction(content=task), NullObservation(''))]
-    history[0][0]._source = EventSource.USER
+    history = ShortTermHistory()
+    history.set_event_stream(event_stream)
+    event_stream.add_event(MessageAction(content=task), EventSource.USER)
+
+    # set state without codebase summary
     state = State(history=history)
     coder_agent.step(state)