update summarize prompt

poetry lock
Merge branch 'enyst/eventstream-state' into enyst/memory-agent
2026-04-29 03:00:45 -04:00 · 2024-10-31 12:11:39 +01:00 · 2024-10-27 09:09:16 +01:00 · 2024-10-27 09:07:46 +01:00 · 2024-10-27 09:02:01 +01:00 · 2024-10-27 08:27:46 +01:00
84 changed files with 3562 additions and 1105 deletions
--- a/config.template.toml
+++ b/config.template.toml
@@ -171,6 +171,24 @@ model = "gpt-4o"
 # If model is vision capable, this option allows to disable image processing (useful for cost reduction).
 #disable_vision = true

+# maximum number of messages in a conversation, after which they are truncated or summarized
+# max_conversation_window = 10
+
+# number of results when recalling message history
+# conversation_top_k = 5
+
+# fraction of the conversation window to summarize
+# message_summary_trunc_tokens_fraction = 0.75
+
+# summary LLM
+[llm.summary]
+model = "deepseek"
+
+# default LLM
+[llm.default]
+model = "claude"
+
+
 [llm.gpt4o-mini]
 api_key = "your-api-key"
 model = "gpt-4o"
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -8,6 +8,7 @@ from evaluation.EDA.game import Q20Game, Q20GameCelebrity
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -34,7 +35,7 @@ def codeact_user_response_eda(state: State) -> str:

    # retrieve the latest model message from history
    if state.history:
-        model_guess = state.history.get_last_agent_message()
+        model_guess = state.get_last_agent_message()

    assert game is not None, 'Game is not initialized.'
    msg = game.generate_user_response(model_guess)
@@ -139,7 +140,7 @@ def process_instance(
    if state is None:
        raise ValueError('State should not be None.')

-    final_message = state.history.get_last_agent_message()
+    final_message = state.get_last_agent_message()

    logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
    test_result = game.reward()
@@ -148,7 +149,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -16,6 +16,7 @@ from evaluation.agent_bench.helper import (
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -242,7 +243,7 @@ def process_instance(
        raw_ans = ''

        # retrieve the last agent message or thought
-        for event in state.history.get_events(reverse=True):
+        for event in reversed(state.history):
            if event.source == 'agent':
                if isinstance(event, AgentFinishAction):
                    raw_ans = event.thought
@@ -271,7 +272,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    metrics = state.metrics.get() if state.metrics else None

--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -15,6 +15,7 @@ from evaluation.aider_bench.helper import (
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -250,7 +251,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -299,7 +300,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    test_result['generated'] = test_result['metadata']['1_copy_change_code']

--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -16,6 +16,7 @@ from tqdm import tqdm
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -46,7 +47,7 @@ def codeact_user_response(state: State) -> str:
        # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
        user_msgs = [
            event
-            for event in state.history.get_events()
+            for event in state.history
            if isinstance(event, MessageAction) and event.source == 'user'
        ]
        if len(user_msgs) > 2:
@@ -431,7 +432,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/browsing_delegation/run_infer.py
+++ b/evaluation/browsing_delegation/run_infer.py
@@ -9,6 +9,7 @@ from datasets import load_dataset
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -89,7 +90,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # find the last delegate action
    last_delegate_action = None
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -166,7 +167,7 @@ def process_instance(

    model_answer_raw = ''
    # get the last message or thought from the agent
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
        if event.source == 'agent':
            if isinstance(event, AgentFinishAction):
                model_answer_raw = event.thought
@@ -203,7 +204,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -101,7 +102,7 @@ def process_instance(
        raise ValueError('State should not be None.')

    # retrieve the last message from the agent
-    model_answer_raw = state.history.get_last_agent_message()
+    model_answer_raw = state.get_last_agent_message()

    # attempt to parse model_answer
    ast_eval_fn = instance['ast_eval']
@@ -114,7 +115,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    output = EvalOutput(
        instance_id=instance_id,
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -28,6 +28,7 @@ from datasets import load_dataset
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -244,7 +245,7 @@ Ok now its time to start solving the question. Good luck!
        'C': False,
        'D': False,
    }
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
        if (
            isinstance(event, AgentFinishAction)
            and event.source != 'user'
@@ -300,7 +301,7 @@ Ok now its time to start solving the question. Good luck!
        instance_id=str(instance.instance_id),
        instruction=instruction,
        metadata=metadata,
-        history=state.history.compatibility_for_eval_history_pairs(),
+        history=compatibility_for_eval_history_pairs(state.history),
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
        test_result={
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -21,6 +21,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -255,7 +256,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -122,7 +122,7 @@ def process_instance(
    # # result evaluation
    # # =============================================

-    histories = state.history.get_events()
+    histories = state.history
    test_result: TestResult = test_class.verify_result(runtime, histories)
    metrics = state.metrics.get() if state.metrics else None

--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -8,6 +8,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -225,7 +226,7 @@ def process_instance(
        raise ValueError('State should not be None.')

    final_message = ''
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
        if isinstance(event, AgentFinishAction):
            final_message = event.thought
            break
@@ -247,7 +248,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -10,6 +10,7 @@ import pandas as pd
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -152,7 +153,7 @@ def process_instance(

    # Instruction is the first message from the USER
    instruction = ''
-    for event in state.history.get_events():
+    for event in state.history:
        if isinstance(event, MessageAction):
            instruction = event.content
            break
@@ -164,7 +165,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -13,6 +13,7 @@ from evaluation.mint.tasks import Task
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -28,6 +29,7 @@ from openhands.core.config import (
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import (
+    Action,
    CmdRunAction,
    MessageAction,
 )
@@ -45,7 +47,10 @@ def codeact_user_response_mint(state: State, task: Task, task_config: dict[str,
        task=task,
        task_config=task_config,
    )
-    last_action = state.history.get_last_action()
+    last_action = next(
+        (event for event in reversed(state.history) if isinstance(event, Action)),
+        None,
+    )
    result_state: TaskState = env.step(last_action.message or '')

    state.extra_data['task_state'] = result_state
@@ -202,7 +207,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -24,6 +24,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -256,7 +257,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -430,7 +430,8 @@ def process_instance(
    if state is None:
        raise ValueError('State should not be None.')

-    histories = [event_to_dict(event) for event in state.history.get_events()]
+    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
+    histories = [event_to_dict(event) for event in state.history]
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -9,6 +9,7 @@ from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -126,7 +127,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
        raise ValueError('State should not be None.')

    # retrieve the last message from the agent
-    model_answer_raw = state.history.get_last_agent_message()
+    model_answer_raw = state.get_last_agent_message()

    # attempt to parse model_answer
    correct = eval_answer(str(model_answer_raw), str(answer))
@@ -137,7 +138,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -18,6 +18,9 @@ from openhands.core.logger import get_console_handler
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import Action
 from openhands.events.action.message import MessageAction
+from openhands.events.event import Event
+from openhands.events.serialization.event import event_to_dict
+from openhands.events.utils import get_pairs_from_events


 class EvalMetadata(BaseModel):
@@ -120,7 +123,7 @@ def codeact_user_response(
        # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
        user_msgs = [
            event
-            for event in state.history.get_events()
+            for event in state.history
            if isinstance(event, MessageAction) and event.source == 'user'
        ]
        if len(user_msgs) >= 2:
@@ -411,3 +414,18 @@ def reset_logger_for_multiprocessing(
    )
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
+
+
+# History is now available as a filtered stream of events rather than a list of (Action, Observation) pairs.
+# This helper rebuilds those pairs with get_pairs_from_events and serializes each event with event_to_dict,
+# for compatibility with the existing output format in evaluations.
+# Remove this when it's no longer necessary.
+def compatibility_for_eval_history_pairs(
+    history: list[Event],
+) -> list[tuple[dict, dict]]:
+    history_pairs = []
+
+    for action, observation in get_pairs_from_events(history):
+        history_pairs.append((event_to_dict(action), event_to_dict(observation)))
+
+    return history_pairs
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -10,6 +10,7 @@ import pandas as pd
 from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
+    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
@@ -166,7 +167,7 @@ def process_instance(

    # Instruction is the first message from the USER
    instruction = ''
-    for event in state.history.get_events():
+    for event in state.history:
        if isinstance(event, MessageAction):
            instruction = event.content
            break
@@ -178,7 +179,7 @@ def process_instance(
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
--- a/openhands/agenthub/__init__.py
+++ b/openhands/agenthub/__init__.py
@@ -13,6 +13,7 @@ from openhands.agenthub import (  # noqa: E402
    codeact_swe_agent,
    delegator_agent,
    dummy_agent,
+    memcodeact_agent,
    planner_agent,
 )

@@ -23,6 +24,7 @@ __all__ = [
    'delegator_agent',
    'dummy_agent',
    'browsing_agent',
+    'memcodeact_agent',
 ]

 for agent in all_microagents.values():
--- a/openhands/agenthub/browsing_agent/browsing_agent.py
+++ b/openhands/agenthub/browsing_agent/browsing_agent.py
@@ -150,13 +150,13 @@ class BrowsingAgent(Agent):
        last_obs = None
        last_action = None

-        if EVAL_MODE and len(state.history.get_events_as_list()) == 1:
+        if EVAL_MODE and len(state.history) == 1:
            # for webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
            # initialize and retrieve the first observation by issuing an noop OP
            # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
            return BrowseInteractiveAction(browser_actions='noop()')

-        for event in state.history.get_events():
+        for event in state.history:
            if isinstance(event, BrowseInteractiveAction):
                prev_actions.append(event.browser_actions)
                last_action = event
--- a/openhands/agenthub/codeact_agent/agent.yaml
+++ b/openhands/agenthub/codeact_agent/agent.yaml
@@ -0,0 +1,33 @@
+name: CodeActAgent
+
+# custom templates directory
+# .j2 templates are loaded from this directory when present; otherwise the default templates are used
+custom_templates_dir: "user_templates"
+
+# main templates
+template:
+  system_prompt: "system_prompt"  # path to the system template file
+  agent_skills: "agent_skills"  # path to the agent skills template file
+  examples: "examples"  # path to the examples template file
+  user_prompt: "user_prompt"  # path to the initial user prompt template file
+
+# agent-specific variables (can be accessed within templates)
+use_tools: false  # whether to use tool-based implementations
+# tools:  # list of available tools
+#  - name: "EditTool"
+#    description: "Edits a file."
+#    usage: "Use the following format: <file_edit> [file_path] [new_file_content] </file_edit>"
+# agent skills
+agent_skills:
+  available_skills:
+    - "file_ops:open_file"
+    - "file_ops:goto_line"
+    - "file_ops:scroll_down"
+    - "file_ops:scroll_up"
+    - "file_ops:search_dir"
+    - "file_ops:search_file"
+    - "file_ops:find_file"
+    - "file_reader:parse_pdf"
+    - "file_reader:parse_docx"
+    - "file_reader:parse_latex"
+    - "file_reader:parse_pptx"
--- a/openhands/agenthub/codeact_agent/agent_skills.j2
+++ b/openhands/agenthub/codeact_agent/agent_skills.j2
@@ -0,0 +1,3 @@
+{% for skill_name in available_skills %}
+{{ get_skill_docstring(skill_name) }}
+{% endfor %}
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -5,6 +5,7 @@ from openhands.agenthub.codeact_agent.action_parser import CodeActResponseParser
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
+from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
    Action,
@@ -91,7 +92,6 @@ class CodeActAgent(Agent):

        self.prompt_manager = PromptManager(
            prompt_dir=os.path.join(os.path.dirname(__file__)),
-            agent_skills_docs=AgentSkillsRequirement.documentation,
            micro_agent=self.micro_agent,
        )

@@ -180,7 +180,8 @@ class CodeActAgent(Agent):
        else:
            # If an observation message is not returned, it will cause an error
            # when the LLM tries to return the next message
-            raise ValueError(f'Unknown observation type: {type(obs)}')
+            logger.warning(f'Unknown observation type: {type(obs)}')
+            return None

    def reset(self) -> None:
        """Resets the CodeAct Agent."""
@@ -201,8 +202,8 @@ class CodeActAgent(Agent):
        - AgentFinishAction() - end the interaction
        """
        # if we're done, go back
-        latest_user_message = state.history.get_last_user_message()
-        if latest_user_message and latest_user_message.strip() == '/exit':
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
            return AgentFinishAction()

        # prepare what we want to send to the LLM
@@ -243,7 +244,7 @@ class CodeActAgent(Agent):
            ),
        ]

-        for event in state.history.get_events():
+        for event in state.history:
            # create a regular message from an event
            if isinstance(event, Action):
                message = self.get_action_message(event)
--- a/openhands/agenthub/codeact_agent/examples.j2
+++ b/openhands/agenthub/codeact_agent/examples.j2
@@ -0,0 +1,224 @@
+{% set DEFAULT_EXAMPLE %}
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me first check the current directory:
+<execute_bash>
+ls
+</execute_bash>
+
+USER:
+OBSERVATION:
+openhands@runtime:~/workspace$
+
+ASSISTANT:
+There is no `app.py` file in the current directory. Let me create a Python file `app.py`:
+<file_edit path="/workspace/app.py" start=-1 end=-1>
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+</file_edit>
+
+USER:
+OBSERVATION:
+[New file /workspace/app.py is created.]
+(begin of changes)
+--- /workspace/app.py
+++ /workspace/app.py
+@@ -1 +1,10 @@
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+(end of changes)
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+Get the content on "http://127.0.0.1:5000"
+</execute_browse>
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_ipython>
+open_file('app.py')
+</execute_ipython>
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+<file_edit path="/workspace/app.py" start=4 end=7>
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+    return ret
+</file_edit>
+
+USER:
+Observation:
+[Edited existing file /workspace/app.py]
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|    return ret
+9|
+10|if __name__ == '__main__':
+11|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+<finish></finish>
+
+--- END OF EXAMPLE ---
+{% endset %}
+Here is an example of how you can interact with the environment for task solving:
+{{ DEFAULT_EXAMPLE }}
+{% if micro_agent %}
+--- BEGIN OF GUIDELINE ---
+The following information may assist you in completing your task:
+
+{{ micro_agent }}
+--- END OF GUIDELINE ---
+{% endif %}
--- a/openhands/agenthub/codeact_agent/system_prompt.j2
+++ b/openhands/agenthub/codeact_agent/system_prompt.j2
@@ -1,11 +1,20 @@
-{% set MINIMAL_SYSTEM_PREFIX %}
-A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
+{# Core system components for the CodeAct Agent #}

+{# Base system identity and core abilities #}
+{% set SYSTEM_PREFIX %}
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
+{% endset %}
+
+{# Python execution capabilities #}
+{% set EXECUTE_PYTHON %}
 [1] The assistant can use a Python environment with <execute_ipython>, e.g.:
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
+{% endset %}

+{# Bash execution capabilities #}
+{% set EXECUTE_BASH %}
 [2] The assistant can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
 If a bash command returns exit code `-1`, this means the process is not yet finished.
 The assistant must then send a second <execute_bash>. The second <execute_bash> can be empty
@@ -134,27 +143,42 @@ class MyClass:
 # MyClass().z is removed
 print(MyClass().y)
 </file_edit>
-
-
 {% endset %}
-{% set BROWSING_PREFIX %}
+
+{# Web browsing #}
+{% set EXECUTE_BROWSE %}
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
-For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
+For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
 {% endset %}
-{% set PIP_INSTALL_PREFIX %}
+
+{# Package management #}
+{% set PIP_INSTALL %}
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
 {% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
-{% set COMMAND_DOCS %}
+
+{# Agent skills documentation #}
+{% set AGENT_SKILLS %}
+{% if use_tools %}
+{# Tool-based implementation #}
+The following tools are available:
+{% for tool in tools %}
+- {{ tool.name }}: {{ tool.description }}
+  Usage: {{ tool.usage }}
+{% endfor %}
+{% else %}
 Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
+
 IMPORTANT:
 - `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
 - Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 - Any code issued should be less than 50 lines to avoid context being cut off!
+{% endif %}
 {% endset %}
-{% set SYSTEM_SUFFIX %}
+
+{# System behavior rules #}
+{% set GENERAL_RULES %}
 Responses should be concise.
 The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
 Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
@@ -163,9 +187,12 @@ IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_bro
 The assistant should utilize full file paths and the `pwd` command to prevent path-related errors.
 The assistant MUST NOT apologize to the user or thank the user after running commands or editing files. It should only address the user in response to an explicit message from the user, or to ask for more information.
 The assistant MUST NOT push any changes to GitHub unless explicitly requested to do so.
-
 {% endset %}
-{# Combine all parts without newlines between them #}
+{# Combine all blocks without newlines #}
 {{ SYSTEM_PREFIX -}}
-{{- COMMAND_DOCS -}}
-{{- SYSTEM_SUFFIX }}
+{{- EXECUTE_PYTHON -}}
+{{- EXECUTE_BASH -}}
+{{- EXECUTE_BROWSE -}}
+{{- PIP_INSTALL -}}
+{{- AGENT_SKILLS -}}
+{{- GENERAL_RULES -}}
--- a/openhands/agenthub/codeact_agent/user_prompt.j2
+++ b/openhands/agenthub/codeact_agent/user_prompt.j2
@@ -1,220 +1,5 @@
-{% set DEFAULT_EXAMPLE %}
--- START OF EXAMPLE ---
+{{ examples }}

-USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
-
-ASSISTANT:
-Sure! Let me first check the current directory:
-<execute_bash>
-ls
-</execute_bash>
-
-USER:
-OBSERVATION:
-openhands@runtime:~/workspace$
-
-ASSISTANT:
-There is no `app.py` file in the current directory. Let me create a Python file `app.py`:
-<file_edit path="/workspace/app.py" start=-1 end=-1>
-from flask import Flask
-app = Flask(__name__)
-
-@app.route('/')
-def index():
-    numbers = list(range(1, 11))
-    return str(numbers)
-
-if __name__ == '__main__':
-    app.run(port=5000)
-</file_edit>
-
-USER:
-OBSERVATION:
-[New file /workspace/app.py is created.]
-(begin of changes)
--- /workspace/app.py
-+++ /workspace/app.py
-@@ -1 +1,10 @@
-+from flask import Flask
-+app = Flask(__name__)
-
-+@app.route('/')
-+def index():
-+    numbers = list(range(1, 11))
-+    return str(numbers)
-+
-+if __name__ == '__main__':
-+    app.run(port=5000)
-(end of changes)
-
-ASSISTANT:
-I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
-
-ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
-<execute_bash>
-cat server.log
-</execute_bash>
-
-USER:
-OBSERVATION:
-Traceback (most recent call last):
-  File "/workspace/app.py", line 2, in <module>
-    from flask import Flask
-ModuleNotFoundError: No module named 'flask'
-
-ASSISTANT:
-It seems that Flask is not installed. Let me install Flask for you:
-<execute_bash>
-pip install flask
-</execute_bash>
-
-USER:
-OBSERVATION:
-Defaulting to user installation because normal site-packages is not writeable
-Collecting flask
-  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
-Collecting blinker>=1.6.2
-  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
-Collecting Werkzeug>=3.0.0
-  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
-Collecting click>=8.1.3
-  Using cached click-8.1.7-py3-none-any.whl (97 kB)
-Collecting itsdangerous>=2.1.2
-  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
-Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
-Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
-Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
-Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
-
-ASSISTANT:
-Now that Flask is installed, let me run the Python file again:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1] 124
-
-ASSISTANT:
-Let me check the server log again:
-<execute_bash>
-cat server.log
-</execute_bash>
-
-USER:
-OBSERVATION:
-* Serving Flask app 'app'
- * Debug mode: off
-WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
- * Running on http://127.0.0.1:5000
-Press CTRL+C to quit
-
-ASSISTANT:
-The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
-
-USER: Now browse the newly started server's homepage and show me the content.
-
-ASSISTANT:
-Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
-<execute_browse>
-Get the content on "http://127.0.0.1:5000"
-</execute_browse>
-
-USER:
-OBSERVATION:
-[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-
-ASSISTANT:
-The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
-
-USER: Now kill the server, make it display the numbers in a table format.
-
-ASSISTANT:
-Sure! Let me stop the server first:
-<execute_bash>
-kill 124
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1]+  Terminated              python3 app.py > server.log 2>&1
-
-ASSISTANT:
-The server has been stopped. Let me open the Python file and modify it.
-<execute_ipython>
-open_file('app.py')
-</execute_ipython>
-
-USER:
-[File: /workspace/app.py (10 lines total)]
-(this is the beginning of the file)
-1|from flask import Flask
-2|app = Flask(__name__)
-3|
-4|@app.route('/')
-5|def index():
-6|    numbers = list(range(1, 11))
-7|    return str(numbers)
-8|
-9|if __name__ == '__main__':
-10|    app.run(port=5000)
-(this is the end of the file)
-
-ASSISTANT:
-I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
-<file_edit path="/workspace/app.py" start=4 end=7>
-@app.route('/')
-def index():
-    numbers = list(range(1, 11))
-    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-    return ret
-</file_edit>
-
-USER:
-Observation:
-[Edited existing file /workspace/app.py]
-[File: /workspace/app.py (10 lines total after edit)]
-(this is the beginning of the file)
-1|from flask import Flask
-2|app = Flask(__name__)
-3|
-4|@app.route('/')
-5|def index():
-6|    numbers = list(range(1, 11))
-7|    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-8|    return ret
-9|
-10|if __name__ == '__main__':
-11|    app.run(port=5000)
-(this is the end of the file)
-[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
-
-ASSISTANT:
-Running the updated file:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-Observation:
-[1] 126
-
-ASSISTANT:
-The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
-<finish></finish>
-
--- END OF EXAMPLE ---
-{% endset %}
-Here is an example of how you can interact with the environment for task solving:
-{{ DEFAULT_EXAMPLE }}
 {% if micro_agent %}
 --- BEGIN OF GUIDELINE ---
 The following information may assist you in completing your task:
@@ -223,4 +8,5 @@ The following information may assist you in completing your task:
 --- END OF GUIDELINE ---
 {% endif %}

+
 NOW, LET'S START!
--- a/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -154,8 +154,8 @@ class CodeActSWEAgent(Agent):
        - AgentFinishAction() - end the interaction
        """
        # if we're done, go back
-        latest_user_message = state.history.get_last_user_message()
-        if latest_user_message and latest_user_message.strip() == '/exit':
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
            return AgentFinishAction()

        # prepare what we want to send to the LLM
@@ -176,7 +176,7 @@ class CodeActSWEAgent(Agent):
            Message(role='user', content=[TextContent(text=self.in_context_example)]),
        ]

-        for event in state.history.get_events():
+        for event in state.history:
            # create a regular message from an event
            if isinstance(event, Action):
                message = self.get_action_message(event)
--- a/openhands/agenthub/delegator_agent/agent.py
+++ b/openhands/agenthub/delegator_agent/agent.py
@@ -2,7 +2,7 @@ from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
 from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction
-from openhands.events.observation import AgentDelegateObservation
+from openhands.events.observation import AgentDelegateObservation, Observation
 from openhands.llm.llm import LLM


@@ -27,7 +27,7 @@ class DelegatorAgent(Agent):
        Otherwise, delegates the task to the next agent in the pipeline.

        Parameters:
-        - state (State): The current state given the previous actions and observations
+        - state: The current state given the previous actions and observations

        Returns:
        - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned'
@@ -41,7 +41,11 @@ class DelegatorAgent(Agent):
            )

        # last observation in history should be from the delegate
-        last_observation = state.history.get_last_observation()
+        last_observation = None
+        for event in reversed(state.history):
+            if isinstance(event, Observation):
+                last_observation = event
+                break

        if not isinstance(last_observation, AgentDelegateObservation):
            raise Exception('Last observation is not an AgentDelegateObservation')
--- a/openhands/agenthub/dummy_agent/agent.py
+++ b/openhands/agenthub/dummy_agent/agent.py
@@ -164,7 +164,7 @@ class DummyAgent(Agent):

            if 'observations' in prev_step and prev_step['observations']:
                expected_observations = prev_step['observations']
-                hist_events = state.history.get_last_events(len(expected_observations))
+                hist_events = state.history[-len(expected_observations) :]

                if len(hist_events) < len(expected_observations):
                    print(
--- a/openhands/agenthub/memcodeact_agent/README.md
+++ b/openhands/agenthub/memcodeact_agent/README.md
@@ -0,0 +1,38 @@
+# MemCodeAct Agent
+
+## Introduction
+
+`memcodeact_agent` is an experimental agent that extends the existing `codeact_agent` with memory capabilities.
+
+## Inspiration and Research
+
+The development of `memcodeact_agent` is inspired by two research papers in the field of generative AI and memory-augmented models:
+
+1. **Generative Agents: Interactive Simulacra of Human Behavior**
+   - **Paper:** [Generative Agents: Interactive Simulacra of Human Behavior](https://arxiv.org/pdf/2304.03442)
+   - **Summary:** This paper explores methods to integrate long-term memory into generative AI models, enabling them to retain and utilize information from past interactions. The approach enhances the model's ability to maintain context over extended conversations, leading to more accurate and relevant outputs. Techniques such as memory slots, retrieval mechanisms, and memory encoding strategies are discussed to facilitate effective information storage and retrieval.
+
+2. **MemGPT: Towards LLMs as Operating Systems**
+   - **Paper:** [MemGPT: Towards LLMs as Operating Systems](https://arxiv.org/pdf/2310.08560)
+   - **Summary:** MemGPT introduces a novel architecture that incorporates external memory modules into GPT models. This integration allows the model to access and update its memory dynamically during interactions. The results demonstrate significant improvements in tasks requiring information recall.
+
+## Getting Started
+
+### Prerequisites
+
+- Configuration variables in `config.toml`, `agent.MemCodeActAgent` section:
+  - `micro_agent_name`: Name of the micro agent to use.
+  - `enable_memory`: Whether to enable long-term memory. Default is true for this agent.
+  - `cache_prompt`: Whether to cache the prompt. Default is false for this agent.
+
+
+- Optional environment variables:
+  - `SANDBOX_ENV_GITHUB_TOKEN`: GitHub Personal Access Token with read-only permissions.
+
+## Documentation
+
+For detailed information on how to interact with the agent, refer to the [User Prompt](user_prompt.j2) and [System Prompt](system_prompt.j2) templates located within the agent's directory. These templates define the conversational flow and the agent's capabilities.
+
+## Contribution
+
+`memcodeact_agent` is an experimental agent designed for research and development purposes. Contributions are welcome!
--- a/openhands/agenthub/memcodeact_agent/__init__.py
+++ b/openhands/agenthub/memcodeact_agent/__init__.py
@@ -0,0 +1,7 @@
+from openhands.controller.agent import Agent
+
+from .memcodeact_agent import MemCodeActAgent
+
+__all__ = ['MemCodeActAgent']
+
+Agent.register('MemCodeActAgent', MemCodeActAgent)
--- a/openhands/agenthub/memcodeact_agent/action_parser.py
+++ b/openhands/agenthub/memcodeact_agent/action_parser.py
@@ -0,0 +1,262 @@
+import re
+
+from openhands.controller.action_parser import ActionParser, ResponseParser
+from openhands.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
+
+
+class MemCodeActResponseParser(ResponseParser):
+    """Parser actions for MemCodeActAgent:
+    - CmdRunAction(command) - bash command to run
+    - IPythonRunCellAction(code) - IPython code to run
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    - AgentFinishAction() - end the interaction
+    - AgentSummarizeAction() - trigger a summarization of the conversation history
+    - AgentRecallAction(query) - recall information from memory
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.action_parsers = [
+            MemCodeActActionParserFinish(),
+            MemCodeActActionParserCmdRun(),
+            MemCodeActActionParserIPythonRunCell(),
+            MemCodeActActionParserAgentDelegate(),
+            MemCodeActActionParserMemorySummarize(),
+            MemCodeActActionParserMemoryRecall(),
+            # MemCodeActActionParserMemoryAdd(),
+        ]
+        self.default_parser = MemCodeActActionParserMessage()
+
+    def parse(self, response) -> Action:
+        action_str = self.parse_response(response)
+        return self.parse_action(action_str)
+
+    def parse_response(self, response) -> str:
+        action = response.choices[0].message.content
+        if action is None:
+            return ''
+
+        # execute actions
+        for lang in ['bash', 'ipython', 'browse']:
+            # special handling for DeepSeek: it has the stop-word bug and returns </execute_ipython instead of </execute_ipython>
+            if f'</execute_{lang}' in action and f'</execute_{lang}>' not in action:
+                action = action.replace(f'</execute_{lang}', f'</execute_{lang}>')
+
+            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
+                action += f'</execute_{lang}>'
+
+        # memory actions
+        for mem in ['summarize', 'recall', 'add']:
+            # the stop-word bug
+            if f'<memory_{mem}>' in action and f'</memory_{mem}>' not in action:
+                action = action.replace(f'</memory_{mem}', f'</memory_{mem}>')
+
+            if f'<memory_{mem}>' in action and f'</memory_{mem}>' not in action:
+                action += f'</memory_{mem}>'
+
+        return action
+
+    def parse_action(self, action_str: str) -> Action:
+        for action_parser in self.action_parsers:
+            if action_parser.check_condition(action_str):
+                return action_parser.parse(action_str)
+        return self.default_parser.parse(action_str)
+
+
+class MemCodeActActionParserFinish(ActionParser):
+    """Parser action:
+    - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        self.finish_command = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.finish_command = re.search(r'<finish>.*</finish>', action_str, re.DOTALL)
+        return self.finish_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.finish_command is not None
+        ), 'self.finish_command should not be None when parse is called'
+        thought = action_str.replace(self.finish_command.group(0), '').strip()
+        return AgentFinishAction(thought=thought)
+
+
+class MemCodeActActionParserCmdRun(ActionParser):
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        self.bash_command = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.bash_command = re.search(
+            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
+        )
+        return self.bash_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.bash_command is not None
+        ), 'self.bash_command should not be None when parse is called'
+        thought = action_str.replace(self.bash_command.group(0), '').strip()
+        # a command was found
+        command_group = self.bash_command.group(1).strip()
+        if command_group.strip() == 'exit':
+            return AgentFinishAction(thought=thought)
+        return CmdRunAction(command=command_group, thought=thought)
+
+
+class MemCodeActActionParserIPythonRunCell(ActionParser):
+    """Parser action:
+    - IPythonRunCellAction(code) - IPython code to run
+    """
+
+    def __init__(
+        self,
+    ):
+        self.python_code = None
+        self.jupyter_kernel_init_code: str = 'from agentskills import *'
+
+    def check_condition(self, action_str: str) -> bool:
+        self.python_code = re.search(
+            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
+        )
+        return self.python_code is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.python_code is not None
+        ), 'self.python_code should not be None when parse is called'
+        code_group = self.python_code.group(1).strip()
+        thought = action_str.replace(self.python_code.group(0), '').strip()
+        return IPythonRunCellAction(
+            code=code_group,
+            thought=thought,
+            kernel_init_code=self.jupyter_kernel_init_code,
+        )
+
+
+class MemCodeActActionParserAgentDelegate(ActionParser):
+    """Parser action:
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    """
+
+    def __init__(
+        self,
+    ):
+        self.agent_delegate = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.agent_delegate = re.search(
+            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
+        )
+        return self.agent_delegate is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.agent_delegate is not None
+        ), 'self.agent_delegate should not be None when parse is called'
+        thought = action_str.replace(self.agent_delegate.group(0), '').strip()
+        browse_actions = self.agent_delegate.group(1).strip()
+        task = f'{thought}. I should start with: {browse_actions}'
+        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})
+
+
+class MemCodeActActionParserMessage(ActionParser):
+    """Parser action:
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """
+
+    def __init__(
+        self,
+    ):
+        pass
+
+    def check_condition(self, action_str: str) -> bool:
+        # We assume the LLM is GOOD enough that when it returns pure natural language
+        # it wants to talk to the user
+        return True
+
+    def parse(self, action_str: str) -> Action:
+        return MessageAction(content=action_str, wait_for_response=True)
+
+
+class MemCodeActActionParserMemoryRecall(ActionParser):
+    """Parser action:
+    - RecallAction(query) - memory action to run
+    """
+
+    def __init__(self):
+        self.recall_query = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.recall_query = re.search(
+            r'<memory_recall>(.*?)</memory_recall>', action_str, re.DOTALL
+        )
+        return self.recall_query is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.recall_query is not None
+        ), 'self.query should not be None when parse is called'
+
+        # thought <memory_recall>query</memory_recall>
+        # Note: the thought is optional
+        thought = action_str.replace(self.recall_query.group(0), '').strip()
+        query = self.recall_query.group(1).strip()
+        return AgentRecallAction(query=query, thought=thought)
+
+
+class MemCodeActActionParserMemorySummarize(ActionParser):
+    """Parser action:
+    - <memory_summarize> - The LLM wants to trigger a summarization of its context
+    """
+
+    def check_condition(self, action_str: str) -> bool:
+        return '<memory_summarize>' in action_str
+
+    def parse(self, action_str: str) -> Action:
+        # let the agent trigger the summarization
+        return AgentSummarizeAction(summary='')
+
+
+class MemCodeActActionParserMemoryAdd(ActionParser):
+    """Parser action:
+    - MemoryAddAction(content) - add text to core memory
+    """
+
+    def __init__(self):
+        self.content = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.content = re.search(
+            r'<memory_add>(.*?)</memory_add>', action_str, re.DOTALL
+        )
+        return self.content is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.content is not None
+        ), 'self.content should not be None when parse is called'
+
+        # <memory_add>content</memory_add>
+        thought = action_str.replace(self.content.group(0), '').strip()
+        return Action()
+        # return MemoryAddAction(content=self.content.group(1).strip(), thought=thought)
--- a/openhands/agenthub/memcodeact_agent/core_memory_prompt.md
+++ b/openhands/agenthub/memcodeact_agent/core_memory_prompt.md
@@ -0,0 +1,5 @@
+Core Memory:
+Your core memory unit will be initially empty. You can add to it important information about the task or your status. Keep it concise and remember that you will use it to guide your actions, so keep it relevant!
+You can add to your core memory using the <memory_add> action.
+For example, <memory_add> The user is working on a project to create a new AI assistant. </memory_add>
+Adding to your core memory is optional. You do NOT need to do it for every message.
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -0,0 +1,418 @@
+import os
+from itertools import islice
+
+from openhands.agenthub.memcodeact_agent.action_parser import MemCodeActResponseParser
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.exceptions import TokenLimitExceededError
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import ImageContent, Message, TextContent
+from openhands.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
+from openhands.events.observation import (
+    AgentDelegateObservation,
+    CmdOutputObservation,
+    IPythonRunCellObservation,
+    UserRejectObservation,
+)
+from openhands.events.observation.agent import AgentRecallObservation
+from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.observation import Observation
+from openhands.events.serialization.event import event_to_memory, truncate_content
+from openhands.llm.llm import LLM
+from openhands.memory.condenser import MemoryCondenser
+from openhands.memory.conversation_memory import ConversationMemory
+from openhands.memory.core_memory import CoreMemory
+from openhands.runtime.plugins import (
+    AgentSkillsRequirement,
+    JupyterRequirement,
+    PluginRequirement,
+)
+from openhands.utils.microagent import MicroAgent
+from openhands.utils.prompt import PromptManager
+
+
+class MemCodeActAgent(Agent):
+    VERSION = '0.1'
+    """
+    The MemCode Act Agent is a memory-enabled version of the CodeAct agent.
+
+    Its memory modules are:
+    - conversation: easy to recall memory (history)
+    - core: core system messages
+    - long_term: long-term memory
+
+    Its memory actions are:
+        - "core_memory_append"
+        - "core_memory_replace"
+        - "conversation_search"
+        - "long_term_memory_insert"
+        - "long_term_memory_search"
+        - "summarize_conversation"
+    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
+
+    ### Overview
+
+    This agent implements:
+    - the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
+    - inspired by the Generative Agents idea([paper](https://arxiv.org/abs/2304.03442)) and the MemGPT idea ([paper](https://arxiv.org/abs/2310.08560))
+
+    The conceptual idea is illustrated below. At each turn, the agent can:
+
+    1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
+    2. **CodeAct**: Choose to perform the task by executing code
+        - Execute any valid Linux `bash` command
+        - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
+    3. **MemGPT**: Manage its own memory
+        - truncate its history and replace it with a summary
+        - store information in its long-term memory
+        - search for information relevant to the task.
+
+    """
+
+    sandbox_plugins: list[PluginRequirement] = [
+        # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
+        # AgentSkillsRequirement provides a lot of Python functions,
+        # and it needs to be initialized before Jupyter for Jupyter to use those functions.
+        AgentSkillsRequirement(),
+        JupyterRequirement(),
+    ]
+
+    action_parser = MemCodeActResponseParser()
+
+    # NOTE: memory includes 'conversation' and 'core' memory blocks
+    conversation_memory: ConversationMemory
+    core_memory: CoreMemory
+
+    def __init__(
+        self,
+        llm: LLM,
+        config: AgentConfig,
+    ) -> None:
+        """Initializes a new instance of the MemCodeActAgent class.
+
+        Parameters:
+        - llm: The LLM to be used by this agent
+        - config: The agent configuration
+        """
+        super().__init__(llm, config)
+
+        self.memory_config = llm.config  # TODO this should be MemoryConfig
+
+        self.micro_agent = (
+            MicroAgent(
+                os.path.join(
+                    os.path.dirname(__file__), 'micro', f'{config.micro_agent_name}.md'
+                )
+            )
+            if config.micro_agent_name
+            else None
+        )
+
+        self.prompt_manager = PromptManager(
+            prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts'),
+            micro_agent=self.micro_agent,
+        )
+
+    def action_to_str(self, action: Action) -> str:
+        if isinstance(action, CmdRunAction):
+            return (
+                f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
+            )
+        elif isinstance(action, IPythonRunCellAction):
+            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
+        elif isinstance(action, AgentDelegateAction):
+            return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
+        elif isinstance(action, MessageAction):
+            logger.debug(f'MessageAction.content: {action.content}')
+            return action.content
+        elif isinstance(action, AgentFinishAction) and action.source == 'agent':
+            return action.thought
+        elif isinstance(action, AgentSummarizeAction):
+            # information about the conversation history
+            hidden_message_count = self.conversation_memory.hidden_message_count
+            if hidden_message_count > 0:
+                summary_message = (
+                    f'\n\nENVIRONMENT REMINDER: prior messages ({hidden_message_count} of {self.conversation_memory.total_message_count} total messages) have been hidden from view due to conversation memory constraints.\n'
+                    + f'The following is a summary of the first {hidden_message_count} messages:\n {action.summary}'
+                )
+                return summary_message
+        elif isinstance(action, AgentRecallAction):
+            return f'{action.thought}\n<memory_recall>\n{action.query[:10]}...\n</memory_recall>'
+        return ''
+
+    def get_action_message(self, action: Action) -> Message | None:
+        if (
+            isinstance(action, AgentDelegateAction)
+            or isinstance(action, CmdRunAction)
+            or isinstance(action, IPythonRunCellAction)
+            or isinstance(action, MessageAction)
+            or (isinstance(action, AgentFinishAction) and action.source == 'agent')
+            or isinstance(action, AgentSummarizeAction)
+            or isinstance(action, AgentRecallAction)
+        ):
+            content = [TextContent(text=self.action_to_str(action))]
+
+            if (
+                self.llm.vision_is_active()
+                and isinstance(action, MessageAction)
+                and action.images_urls
+            ):
+                content.append(ImageContent(image_urls=action.images_urls))
+
+            return Message(
+                role='user' if action.source == 'user' else 'assistant', content=content
+            )
+        return None
+
+    def get_observation_message(self, obs: Observation) -> Message | None:
+        max_message_chars = self.llm.config.max_message_chars
+        obs_prefix = 'ENVIRONMENT OBSERVATION:\n'
+        if isinstance(obs, CmdOutputObservation):
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text += (
+                f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
+            )
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, IPythonRunCellObservation):
+            text = obs_prefix + obs.content
+            # replace base64 images with a placeholder
+            splitted = text.split('\n')
+            for i, line in enumerate(splitted):
+                if '![image](data:image/png;base64,' in line:
+                    splitted[i] = (
+                        '![image](data:image/png;base64, ...) already displayed to user'
+                    )
+            text = '\n'.join(splitted)
+            text = truncate_content(text, max_message_chars)
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, AgentDelegateObservation):
+            text = obs_prefix + truncate_content(
+                obs.outputs['content'] if 'content' in obs.outputs else '',
+                max_message_chars,
+            )
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, ErrorObservation):
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text += '\n[Error occurred in processing last action]'
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, UserRejectObservation):
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text += '\n[Last action has been rejected by the user]'
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, AgentRecallObservation):
+            text = 'MEMORY RECALL:\n' + obs.memory
+            return Message(role='user', content=[TextContent(text=text)])
+        else:
+            # If an observation message is not returned, it will cause an error
+            # when the LLM tries to return the next message
+            logger.debug(f'Unknown observation type: {type(obs)}')
+            return None
+
+    def reset(self) -> None:
+        """Resets the MemCodeAct Agent."""
+        super().reset()
+
+        # reset the memory modules
+        self.core_memory.reset()
+        self.conversation_memory.reset()
+
+    def step(self, state: State) -> Action:
+        """Performs one step using the MemCodeAct Agent.
+        This includes gathering info on previous steps and prompting the model to make an action to execute.
+
+        Parameters:
+        - state (State): used to get updated info
+
+        Returns:
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - SummarizeAction() - summarize the conversation
+        - RecallAction() - search the agent's history
+        - LongTermMemoryInsertAction() - archive information in the long-term memory
+        - LongTermMemorySearchAction() - search the agent's long-term memory
+        - AgentFinishAction() - end the interaction
+        """
+        # if we're done, go back
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
+            return AgentFinishAction()
+
+        # initialize the memory modules
+
+        # stores and searches the agent's long-term memory (vector store)
+        # long_term_memory = LongTermMemory(llm_config=memory_config, agent_config=config, event_stream=self.event_stream)
+
+        # stores and recalls the whole agent's history
+        assert self.memory_config is not None
+
+        # update conversation memory for this step
+        if not hasattr(self, 'conversation_memory') or not self.conversation_memory:
+            self.conversation_memory = ConversationMemory(
+                memory_config=self.memory_config, state=state
+            )
+        else:
+            self.conversation_memory.update(state)
+
+        # initialize core memory
+        if not hasattr(self, 'core_memory') or not self.core_memory:
+            self.core_memory = CoreMemory(limit=1500)
+
+        # prepare what we want to send to the LLM
+        messages = self._get_messages(state)
+        params = {
+            'messages': self.llm.format_messages_for_llm(messages),
+            'stop': [
+                '</execute_ipython>',
+                '</execute_bash>',
+                '</execute_browse>',
+            ],
+        }
+
+        # catch ContextWindowExceededError and TokenLimitExceededError
+        try:
+            response = self.llm.completion(**params)
+        except TokenLimitExceededError as e:
+            logger.error(e, exc_info=False)
+
+            # run condenser directly
+            summary_action = self.summarize_messages(state)
+
+            # just return for now
+            return summary_action
+        return self.action_parser.parse(response)
+
+    def _get_messages(self, state: State) -> list[Message]:
+        # update prompt manager with current core memory
+        self.prompt_manager.core_memory = self.core_memory.format_blocks()
+
+        messages: list[Message] = [
+            Message(
+                role='system',
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.system_message,
+                        cache_prompt=self.llm.is_caching_prompt_active(),
+                    )
+                ],
+                condensable=False,
+            ),
+            Message(
+                role='user',
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.initial_user_message,
+                        cache_prompt=self.llm.is_caching_prompt_active(),  # the user asks the same query
+                    )
+                ],
+                condensable=False,
+            ),
+        ]
+
+        for event in self.conversation_memory.memory:
+            # if it is a summary or recall, it will not have event_id for now
+            if isinstance(event, AgentSummarizeAction):
+                message = self.get_action_message(event)
+            elif isinstance(event, AgentRecallAction):
+                message = self.get_action_message(event)
+            elif isinstance(event, AgentRecallObservation):
+                message = self.get_observation_message(event)
+            else:
+                # create a regular message from an event
+                if isinstance(event, Action):
+                    message = self.get_action_message(event)
+                elif isinstance(event, Observation):
+                    message = self.get_observation_message(event)
+                else:
+                    raise ValueError(f'Unknown event type: {type(event)}')
+
+            # add regular message
+            if message:
+                # handle error if the message is the SAME role as the previous message
+                # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
+                # there shouldn't be two consecutive messages from the same role
+                if messages and messages[-1].role == message.role:
+                    messages[-1].content.extend(message.content)
+                else:
+                    messages.append(message)
+
+        # Add caching to the last 2 user messages
+        # if self.llm.is_caching_prompt_active():
+        #    user_turns_processed = 0
+        #    for message in reversed(messages):
+        #        if message.role == 'user' and user_turns_processed < 2:
+        #            message.content[
+        #                -1
+        #            ].cache_prompt = True  # Last item inside the message content
+        #            user_turns_processed += 1
+
+        # The latest user message is important:
+        # we want to remind the agent of the environment constraints
+        latest_user_message = next(
+            islice(
+                (
+                    m
+                    for m in reversed(messages)
+                    if m.role == 'user'
+                    and any(isinstance(c, TextContent) for c in m.content)
+                ),
+                1,
+            ),
+            None,
+        )
+
+        # set the last 4 messages to be non-condensable
+        # TODO make this configurable for experimentation
+        for message in messages[-4:]:
+            message.condensable = False
+
+        # iterations reminder
+        if latest_user_message:
+            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+            latest_user_message.content.append(TextContent(text=reminder_text))
+
+        return messages
+
+    def summarize_messages(self, state: State) -> AgentSummarizeAction | None:
+        """Summarizes the earlier messages in the agent's memory to reduce token usage. Roughly uses memGPT's algorithm for in-place summarization."""
+        if len(state.history) <= 2:
+            return None  # ignore
+
+        # summarize the conversation history using the condenser
+        condenser = MemoryCondenser(self.llm, self.prompt_manager)
+
+        # send all messages and let it sort it out
+        messages = self._get_messages(state)
+        summary_action = condenser.condense(messages)
+
+        # update conversation memory with the summary
+        if summary_action and summary_action.summary:
+            self.conversation_memory.update_summary(
+                summary_action.summary, summary_action.end_id
+            )
+
+        return summary_action
+
+    def recall_from_memory(self, query: str, top_k: int = 5) -> AgentRecallObservation:
+        """Searches the conversation memory for relevant information."""
+        # note: pairs are better than events for this
+        recalled_events = self.conversation_memory.search(self.llm, query, top_k)
+
+        # format the recalled events into a readable format
+        recalled_text = '\n'.join(
+            [f'- {event_to_memory(event, -1)}' for event in recalled_events]
+        )
+
+        return AgentRecallObservation(
+            content=f'Searching memory for: {query}', query=query, memory=recalled_text
+        )
--- a/openhands/agenthub/memcodeact_agent/micro/github.md
+++ b/openhands/agenthub/memcodeact_agent/micro/github.md
@@ -0,0 +1,69 @@
+---
+name: github
+agent: MemCodeActAgent
+require_env_var:
+    SANDBOX_ENV_GITHUB_TOKEN: "Create a GitHub Personal Access Token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) and set it as SANDBOX_ENV_GITHUB_TOKEN in your environment variables."
+---
+
+# How to Interact with GitHub
+
+## Environment Variable Available
+
+- `GITHUB_TOKEN`: A read-only token for GitHub.
+
+## Using GitHub's RESTful API
+
+Use `curl` with the `GITHUB_TOKEN` to interact with GitHub's API.
+
+Here's a template for API calls:
+
+```sh
+curl -H "Authorization: token $GITHUB_TOKEN" \
+    "https://api.github.com/{endpoint}"
+```
+
+First replace `{endpoint}` with the specific API path. Common operations:
+
+1. View an issue or pull request:
+   - Issues: `/repos/{owner}/{repo}/issues/{issue_number}`
+   - Pull requests: `/repos/{owner}/{repo}/pulls/{pull_request_number}`
+
+2. List repository issues or pull requests:
+   - Issues: `/repos/{owner}/{repo}/issues`
+   - Pull requests: `/repos/{owner}/{repo}/pulls`
+
+3. Search issues or pull requests:
+   - `/search/issues?q=repo:{owner}/{repo}+is:{type}+{search_term}+state:{state}`
+   - Replace `{type}` with `issue` or `pr`
+
+4. List repository branches:
+   `/repos/{owner}/{repo}/branches`
+
+5. Get commit details:
+   `/repos/{owner}/{repo}/commits/{commit_sha}`
+
+6. Get repository details:
+   `/repos/{owner}/{repo}`
+
+7. Get user information:
+   `/user`
+
+8. Search repositories:
+   `/search/repositories?q={query}`
+
+9. Get rate limit status:
+   `/rate_limit`
+
+Replace `{owner}`, `{repo}`, `{commit_sha}`, `{issue_number}`, `{pull_request_number}`,
+`{search_term}`, `{state}`, and `{query}` with appropriate values.
+
+## Important Notes
+
+1. Always use the GitHub API for operations instead of a web browser.
+2. The `GITHUB_TOKEN` is read-only. Avoid operations that require write access.
+3. Git config (username and email) is pre-set. Do not modify.
+4. Edit and test code locally. Never push directly to remote.
+5. Verify correct branch before committing.
+6. Commit changes frequently.
+7. If the issue or task is ambiguous or lacks sufficient detail, always request clarification from the user before proceeding.
+8. You should avoid using command line tools like `sed` for file editing.
--- a/openhands/agenthub/memcodeact_agent/prompts/components/memory.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/components/memory.j2
@@ -0,0 +1,58 @@
+{# Memory system components #}
+
+{% macro memory_system() %}
+Apart from acting on the environment, you can also act on your own memory. You have three memory systems:
+
+1. Core Memory: Persistent important information that you always need to access
+   - Use <core_memory_append>category: content</core_memory_append> to add information
+   - Categories:
+     * personality: Your capabilities and traits
+     * task_context: Everything important about your current task
+   - Always add critical information about your task to task_context
+
+2. Conversation Memory: Your recent interaction history
+   - Use <memory_recall>query</memory_recall> to search past conversations
+   - Use <memory_summarize> to create summaries when context gets too long
+
+3. System Memory: Your basic capabilities and constraints
+   - Always available through your system prompt
+
+{{ memory_control_flow() }}
+{{ memory_editing() }}
+
+RECALL MEMORY (CONVERSATION HISTORY):
+Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
+This conversation history database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user or previous environment feedback.
+You can search your conversation history using the <memory_recall> action.
+
+SUMMARY AND RECALL EXAMPLE:
+---
+ENVIRONMENT REMINDER:
+Prior messages (51 of 100 total messages) have been hidden from view due to conversation memory constraints.
+The following is a summary of the first 51 messages:
+I read the documentation of the wombat library, about its functions like: wombat.array, wombat.zeros, wombat.ones, etc.
+
+<memory_recall> What does the wombat.zeros() function do? </memory_recall>
+
+MEMORY RECALL:
+It creates an array of zero-filled tuples with the given size.
+---
+{% endmacro %}
+
+{% macro memory_control_flow() %}
+CONTROL FLOW:
+Unlike a human, your brain is not continuously thinking, but is run in short bursts.
+Historically, older AIs were only capable of thinking when a user messaged them (their program ran to generate a reply to a user, and otherwise was left on standby).
+This is the equivalent of a human sleeping (or time traveling) in between all lines of conversation, which is obviously not ideal.
+Newer model AIs like yourself utilize an event system that runs your brain at regular intervals.
+Your brain is run in response to user events (user gave you a task, user asked you a question, etc.), similar to older models.
+However, in addition, your brain is run at regular intervals, to mimic a human who has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
+{% endmacro %}
+
+{% macro memory_editing() %}
+MEMORY EDITING:
+Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
+This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
+Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
+Your ability to edit and recall your own long-term memory is a key part of what makes you a very helpful model.
+{% endmacro %}
--- a/openhands/agenthub/memcodeact_agent/prompts/components/system.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/components/system.j2
@@ -0,0 +1,44 @@
+{# Core system components #}
+
+{% macro system_prefix() %}
+You are a new generation AI assistant, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
+
+{% if core_memory %}
+Core Memory:
+{{ core_memory }}
+{% endif %}
+
+You are able to act on the environment directly, and you can also manage your own memory.
+You can use a Python environment with <execute_ipython>, e.g.:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+You can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
+If a bash command returns exit code `-1`, this means the process is not yet finished.
+You must then send a second <execute_bash>. The second <execute_bash> can be empty
+(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
+or it can contain the text `ctrl+c` to interrupt the process.
+
+For commands that may run indefinitely, you should redirect the output to a file and run
+the command in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+If a command execution result says "Command timed out. Sending SIGINT to the process",
+you should retry running the command in the background.
+{% endmacro %}
+
+{% macro environment_rules() %}
+You act on the environment using actions like <execute_ipython>. When you receive data prefixed with 'ENVIRONMENT OBSERVATION:', it is non-interactive information from the environment, not from a human user.
+For example:
+"ENVIRONMENT OBSERVATION:
+The system returned a value of 42."
+Do not respond to these messages. Use them to perform your task.
+{% endmacro %}
+
+{% macro pip_install_rules() %}
+You can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
+{% endmacro %}
+
+{% macro browsing_rules() %}
+The assistant can browse the Internet with <execute_browse> and </execute_browse>.
+For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
+Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
+{% endmacro %}
--- a/openhands/agenthub/memcodeact_agent/prompts/memory_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/memory_prompt.j2
@@ -0,0 +1,35 @@
+{% set MEMORY_PROMPT %}
+CONTROL FLOW:
+Unlike a human, your brain is not continuously thinking, but is run in short bursts.
+Historically, older AIs were only capable of thinking when a user messaged them (their program ran to generate a reply to a user, and otherwise was left on standby).
+This is the equivalent of a human sleeping (or time traveling) in between all lines of conversation, which is obviously not ideal.
+Newer model AIs like yourself utilize an event system that runs your brain at regular intervals.
+Your brain is run in response to user events (user gave you a task, user asked you a question, etc.), similar to older models.
+However, in addition, your brain is run at regular intervals, to mimic a human who has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
+
+MEMORY EDITING:
+Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
+This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
+Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
+Your ability to edit and recall your own long-term memory is a key part of what makes you a very helpful model.
+
+RECALL MEMORY (CONVERSATION HISTORY):
+Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
+This conversation history database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user or previous environment feedback.
+You can search your conversation history using the <memory_recall> action.
+
+SUMMARY AND RECALL EXAMPLE:
+---
+ENVIRONMENT REMINDER:
+Prior messages (51 of 100 total messages) have been hidden from view due to conversation memory constraints.
+The following is a summary of the first 51 messages:
+I read the documentation of the wombat library, about its functions like: wombat.array, wombat.zeros, wombat.ones, etc.
+
+<memory_recall> What does the wombat.zeros() function do? </memory_recall>
+
+MEMORY RECALL:
+It creates an array of zero-filled tuples with the given size.
+---
+{% endset %}
+
+{{ MEMORY_PROMPT }}
--- a/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
@@ -0,0 +1,71 @@
+{% set SUMMARIZE_PROMPT %}
+You are an expert memory condenser for an AI agent (that agent is "me" from my perspective). Your sole purpose is to help me maintain critical information while managing my limited context window.
+
+CRITICAL CONTEXT:
+- My context window is dangerously close to overflowing
+- Without your help summarizing, I will lose important information permanently
+- I need you to preserve the most relevant information for my current task
+- Your summary will become part of my memory, influencing my future actions
+
+YOU MUST DO IT RIGHT NOW OR I WILL LOSE INFORMATION PERMANENTLY AND FAIL THE TASK.
+
+SUMMARIZATION PRINCIPLES:
+1. Preserve task-critical information:
+   - Current objective and progress
+   - Important user preferences or constraints
+   - Key findings or decisions made
+   - Unfinished tasks or promises
+
+2. Maintain continuity:
+   - Keep information about ongoing processes
+   - Remember important variables or state
+   - Preserve error contexts if any task failed
+
+3. Drop unnecessary details:
+   - Completed subtasks that don't affect current state
+   - Intermediate calculations or debug outputs
+   - Pleasantries and general conversation
+   - Redundant information
+
+FORMAT REQUIREMENTS:
+Return a JSON response that I can easily process:
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "FIRST PERSON perspective of what I did and learned. Focus on decisions and knowledge gained.",
+        "summarized_observations": "THIRD PERSON factual record of system outputs and user responses"
+    }
+}
+
+EXAMPLE GOOD SUMMARY:
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "I identified the user needs a script to process CSV files. I installed pandas and numpy. I encountered a UnicodeDecodeError with the first approach but resolved it by using utf-8 encoding.",
+        "summarized_observations": "The system successfully installed required packages. User provided a sample CSV with 1000 rows. First attempt to read file failed due to encoding issues."
+    }
+}
+
+EXAMPLE BAD SUMMARY (DO NOT DO THIS):
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "The AI assistant helped with CSV processing and fixed some errors",
+        "summarized_observations": "Things were installed and a file was processed"
+    }
+}
+
+IMPORTANT REMINDERS:
+- Write "summarized_actions" in FIRST PERSON (I/me) - this is MY memory
+- Write "summarized_observations" in THIRD PERSON - these are external events
+- Include specific technical details that might be needed later
+- Stay focused on information relevant to completing the current task
+- Preserve any error contexts that might affect future actions
+- Keep numbers, variable names, and technical parameters exactly as they appeared
+
+Now, carefully condense this conversation history while maintaining critical context:
+-------------------------------------
+{{ conversation_history }}
+-------------------------------------
+{% endset %}
+{{ SUMMARIZE_PROMPT }}
--- a/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
@@ -0,0 +1,34 @@
+{# Import components #}
+{% import "components/system.j2" as system %}
+{% import "components/memory.j2" as memory %}
+
+{# Compose the system prompt #}
+{{ system.system_prefix() }}
+{{ system.pip_install_rules() }}
+{{ system.environment_rules() }}
+{{ memory.memory_system() }}
+{{ system.browsing_rules() }}
+
+{# Agent capabilities documentation #}
+{% if agent_skills_docs %}
+Apart from the standard Python library, you can also use the following functions (already imported) in <execute_ipython> environment:
+{{ agent_skills_docs }}
+IMPORTANT:
+- `open_file` only returns the first 100 lines of the file by default! You MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
+- The `edit_file_by_replace`, `append_file` and `insert_content_at_line` functions REQUIRE PROPER INDENTATION. If you would like to add the line '        print(x)', you must fully write the line out, with all leading spaces before the code!
+- Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+- Any code issued should be less than 50 lines to avoid context being cut off!
+- After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
+- For `edit_file_by_replace` NEVER provide empty parameters!
+- For `edit_file_by_replace` the file must be read fully before any replacements!
+{% endif %}
+
+{# System constraints #}
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE <execute_ipython>, <execute_bash>, or <memory_recall> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If you are finished with the task you MUST include <finish></finish> in your response.
+IMPORTANT: Execute code using <execute_ipython> or <execute_bash> whenever possible, and use memory operations (<memory_recall>) when necessary.
+You must use full file paths and the `pwd` command to prevent path-related errors.
+You should only address the user in response to an explicit message from the user, or to ask for more information.
+You MUST NOT push any changes to GitHub unless explicitly requested to do so.
--- a/openhands/agenthub/memcodeact_agent/prompts/user_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/user_prompt.j2
@@ -0,0 +1,195 @@
+{% set DEFAULT_EXAMPLE %}
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+<execute_ipython>
+create_file('app.py')
+</execute_ipython>
+
+ENVIRONMENT OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)"""
+
+insert_content_at_line(
+  'app.py',
+  1,
+  EDITED_CODE,
+)
+</execute_ipython>
+
+ENVIRONMENT OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server started with PID 121 but then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 1, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+Get the content on "http://127.0.0.1:5000"
+</execute_browse>
+
+ENVIRONMENT OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
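+The server has been stopped. Let me modify the Python file so that it displays the numbers in a table format:
+<execute_ipython>
+edit_file_by_replace('app.py', to_replace="    return str(numbers)", new_content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'")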
+</execute_ipython>
+
+ENVIRONMENT OBSERVATION:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+<finish></finish>
+
+--- END OF EXAMPLE ---
+{% endset %}
+Here is an example of how you can interact with the environment for task solving:
+{{ DEFAULT_EXAMPLE }}
+{% if micro_agent %}
+--- BEGIN OF GUIDELINE ---
+The following information may assist you in completing your task:
+
+{{ micro_agent }}
+--- END OF GUIDELINE ---
+{% endif %}
+
+NOW, LET'S START!
--- a/openhands/agenthub/micro/agent.py
+++ b/openhands/agenthub/micro/agent.py
@@ -8,10 +8,10 @@ from openhands.core.config import AgentConfig
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.core.utils import json
 from openhands.events.action import Action
+from openhands.events.event import Event
 from openhands.events.serialization.action import action_from_dict
 from openhands.events.serialization.event import event_to_memory
 from openhands.llm.llm import LLM
-from openhands.memory.history import ShortTermHistory


 def parse_response(orig_response: str) -> Action:
@@ -32,16 +32,14 @@ class MicroAgent(Agent):
    prompt = ''
    agent_definition: dict = {}

-    def history_to_json(
-        self, history: ShortTermHistory, max_events: int = 20, **kwargs
-    ):
+    def history_to_json(self, history: list[Event], max_events: int = 20, **kwargs):
        """
        Serialize and simplify history to str format
        """
        processed_history = []
        event_count = 0

-        for event in history.get_events(reverse=True):
+        for event in reversed(history):
            if event_count >= max_events:
                break
            processed_history.append(
--- a/openhands/agenthub/planner_agent/agent.py
+++ b/openhands/agenthub/planner_agent/agent.py
@@ -20,7 +20,9 @@ class PlannerAgent(Agent):
        """Initialize the Planner Agent with an LLM

        Parameters:
-        - llm (LLM): The llm to be used by this agent
+        - llm: The llm to be used by this agent
+        - config: The agent config
+        - memory: The memory for this agent
        """
        super().__init__(llm, config)

--- a/openhands/agenthub/planner_agent/prompt.py
+++ b/openhands/agenthub/planner_agent/prompt.py
@@ -117,7 +117,7 @@ def get_hint(latest_action_id: str) -> str:

 def get_prompt_and_images(
    state: State, max_message_chars: int
-) -> tuple[str, list[str]]:
+) -> tuple[str, list[str] | None]:
    """Gets the prompt for the planner agent.

    Formatted with the most recent action-observation pairs, current task, and hint based on last action
@@ -136,7 +136,7 @@ def get_prompt_and_images(
    latest_action: Action = NullAction()

    # retrieve the latest HISTORY_SIZE events
-    for event_count, event in enumerate(state.history.get_events(reverse=True)):
+    for event_count, event in enumerate(reversed(state.history)):
        if event_count >= HISTORY_SIZE:
            break
        if latest_action == NullAction() and isinstance(event, Action):
--- a/openhands/controller/agent.py
+++ b/openhands/controller/agent.py
@@ -5,6 +5,7 @@ if TYPE_CHECKING:
    from openhands.controller.state.state import State
    from openhands.core.config import AgentConfig
    from openhands.events.action import Action
+from openhands.core.config.llm_config import LLMConfig
 from openhands.core.exceptions import (
    AgentAlreadyRegisteredError,
    AgentNotRegisteredError,
@@ -19,7 +20,7 @@ class Agent(ABC):
    This abstract base class is an general interface for an agent dedicated to
    executing a specific instruction and allowing human interaction with the
    agent during execution.
-    It tracks the execution status and maintains a history of interactions.
+    It tracks the execution status and maintains a reference to the conversation memory.
    """

    _registry: dict[str, Type['Agent']] = {}
@@ -29,9 +30,11 @@ class Agent(ABC):
        self,
        llm: LLM,
        config: 'AgentConfig',
+        memory_config: LLMConfig | None = None,
    ):
        self.llm = llm
        self.config = config
+        self.memory_config = memory_config
        self._complete = False

    @property
@@ -55,7 +58,7 @@ class Agent(ABC):
        to prepare the agent for restarting the instruction or cleaning up before destruction.

        """
-        # TODO clear history
+        # TODO: also reset the agent's memory here (e.g. self.memory.reset()) once the agent holds a memory object
        self._complete = False

        if self.llm:
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -1,7 +1,7 @@
 import asyncio
 import copy
 import traceback
-from typing import Type
+from typing import ClassVar, Type

 import litellm

@@ -31,6 +31,7 @@ from openhands.events.action import (
    ModifyTaskAction,
    NullAction,
 )
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
 from openhands.events.event import Event
 from openhands.events.observation import (
    AgentDelegateObservation,
@@ -38,6 +39,7 @@ from openhands.events.observation import (
    CmdOutputObservation,
    ErrorObservation,
    FatalErrorObservation,
+    NullObservation,
    Observation,
 )
 from openhands.events.serialization.event import truncate_content
@@ -63,6 +65,13 @@ class AgentController:
    parent: 'AgentController | None' = None
    delegate: 'AgentController | None' = None
    _pending_action: Action | None = None
+    filter_out: ClassVar[tuple[type[Event], ...]] = (
+        NullAction,
+        NullObservation,
+        ChangeAgentStateAction,
+        AgentStateChangedObservation,
+        FatalErrorObservation,
+    )

    def __init__(
        self,
@@ -117,12 +126,41 @@ class AgentController:
        self._initial_max_iterations = max_iterations
        self._initial_max_budget_per_task = max_budget_per_task

+        # use long term memory
+        # self.long_term_memory = LongTermMemory(self.agent.llm.config, self.agent.config, self.event_stream)
+
        # stuck helper
        self._stuck_detector = StuckDetector(self.state)

    async def close(self):
-        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream."""
+        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream.
+
+        Note that it's fairly important that this closes properly, otherwise the state is incomplete."""
        await self.set_agent_state_to(AgentState.STOPPED)
+
+        # rebuild state.history from the event stream now that the run is over
+        # the final state.history will be used by external scripts like evals, tests, etc.
+        # so it needs to be complete, INCLUDING the events of delegates
+        # like the regular agent history, it does not include:
+        # - 'hidden' events, i.e. events with hidden=True
+        # - backend events (the types listed in self.filter_out)
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id >= 0
+            else self.event_stream.get_latest_event_id()
+        )
+        self.state.history = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
+            )
+        )
+
+        # unsubscribe from the event stream
        self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER)

    def update_state_before_step(self):
@@ -137,8 +175,9 @@ class AgentController:
        """Reports an error to the user and sends the exception to the LLM next step, in the hope it can self-correct.

        This method should be called for a particular type of errors, which have:
-        - a user-friendly message, which will be shown in the chat box. This should not be a raw exception message.
-        - an ErrorObservation that can be sent to the LLM by the user role, with the exception message, so it can self-correct next time.
+        - message: a user-friendly message, which will be shown in the chat box. This should not be a raw exception message.
+        - an ErrorObservation that can be sent to the LLM, with the exception message, so it can self-correct next time.
+        - exception: the underlying exception, which is used by evals and tests to check what error the agent encountered.
        """
        self.state.last_error = message
        if exception:
@@ -180,6 +219,11 @@ class AgentController:
        """
        if hasattr(event, 'hidden') and event.hidden:
            return
+
+        # if the event is not filtered out, add it to the history
+        if not any(isinstance(event, filter_type) for filter_type in self.filter_out):
+            self.state.history.append(event)
+
        if isinstance(event, Action):
            await self._handle_action(event)
        elif isinstance(event, Observation):
@@ -211,6 +255,13 @@ class AgentController:
            self.state.outputs = action.outputs
            self.state.metrics.merge(self.state.local_metrics)
            await self.set_agent_state_to(AgentState.REJECTED)
+        elif isinstance(action, AgentSummarizeAction):
+            self.state.summary = action
+        elif isinstance(action, AgentRecallAction):
+            # llama_index_list = self.long_term_memory.search(action.query, action.history)
+            # logger.info(f'llama-index list: {llama_index_list}')
+            litellm_list = self.agent.llm.search(action.query, self.state.history)
+            logger.info(f'litellm list: {litellm_list}')

    async def _handle_observation(self, observation: Observation):
        """Handles observation from the event stream.
@@ -239,17 +290,23 @@ class AgentController:
            self.agent.llm.metrics.merge(observation.llm_metrics)

        if self._pending_action and self._pending_action.id == observation.cause:
+            # FIXME we may want each of these with the other's context
+            # self.long_term_memory.add_event(self._pending_action)
+            # self.long_term_memory.add_event(observation)
+
+            # the runtime has handled the action, so we can clear it
            self._pending_action = None
-            if self.state.agent_state == AgentState.USER_CONFIRMED:
-                await self.set_agent_state_to(AgentState.RUNNING)
-            if self.state.agent_state == AgentState.USER_REJECTED:
-                await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
-            return
+
+            # set the right state when the user confirms or rejects, if we're otherwise good to go (not an error)
+            if not isinstance(observation, ErrorObservation):
+                if self.state.agent_state == AgentState.USER_CONFIRMED:
+                    await self.set_agent_state_to(AgentState.RUNNING)
+                elif self.state.agent_state == AgentState.USER_REJECTED:
+                    await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+                return

        if isinstance(observation, CmdOutputObservation):
            return
-        elif isinstance(observation, AgentDelegateObservation):
-            self.state.history.on_event(observation)
        elif isinstance(observation, ErrorObservation):
            if self.state.agent_state == AgentState.ERROR:
                self.state.metrics.merge(self.state.local_metrics)
@@ -275,10 +332,15 @@ class AgentController:
        elif action.source == EventSource.AGENT and action.wait_for_response:
            await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)

+        # add to long term memory
+        # self.long_term_memory.add_event(action)
+
    def reset_task(self):
        """Resets the agent's task."""

        self.almost_stuck = 0
+
+        # FIXME: wipe out the memory
        self.agent.reset()

    async def set_agent_state_to(self, new_state: AgentState):
@@ -365,6 +427,7 @@ class AgentController:
        Args:
            action (AgentDelegateAction): The action containing information about the delegate agent to start.
        """
+        # prepare the required arguments for the delegate agent: llm, agent_config, memory
        agent_cls: Type[Agent] = Agent.get_cls(action.agent)
        agent_config = self.agent_configs.get(action.agent, self.agent.config)
        llm_config = self.agent_to_llm_config.get(action.agent, self.agent.llm.config)
@@ -378,6 +441,8 @@ class AgentController:
            delegate_level=self.state.delegate_level + 1,
            # global metrics should be shared between parent and child
            metrics=self.state.metrics,
+            # start on top of the stream
+            start_id=self.event_stream.get_latest_event_id() + 1,
        )
        logger.info(
            f'[Agent Controller {self.id}]: start delegate, creating agent {delegate_agent.name} using LLM {llm}'
@@ -403,9 +468,6 @@ class AgentController:
            return

        if self._pending_action:
-            logger.debug(
-                f'{self.agent.name} LEVEL {self.state.delegate_level} LOCAL STEP {self.state.local_iteration} GLOBAL STEP {self.state.iteration} awaiting pending action to get executed: {self._pending_action}'
-            )
            await asyncio.sleep(1)
            return

@@ -480,9 +542,7 @@ class AgentController:

    async def _delegate_step(self):
        """Executes a single step of the delegate agent."""
-        logger.debug(f'[Agent Controller {self.id}] Delegate not none, awaiting...')
        await self.delegate._step()  # type: ignore[union-attr]
-        logger.debug(f'[Agent Controller {self.id}] Delegate step done')
        assert self.delegate is not None
        delegate_state = self.delegate.get_agent_state()
        logger.debug(f'[Agent Controller {self.id}] Delegate state: {delegate_state}')
@@ -490,12 +550,21 @@ class AgentController:
            # update iteration that shall be shared across agents
            self.state.iteration = self.delegate.state.iteration

+            # emit AgentDelegateObservation to mark delegate termination due to error
+            delegate_outputs = (
+                self.delegate.state.outputs if self.delegate.state else {}
+            )
+            content = (
+                f'{self.delegate.agent.name} encountered an error during execution.'
+            )
+            obs = AgentDelegateObservation(outputs=delegate_outputs, content=content)
+            self.event_stream.add_event(obs, EventSource.AGENT)
+
            # close the delegate upon error
            await self.delegate.close()
            self.delegate = None
            self.delegateAction = None

-            await self.report_error('Delegator agent encountered an error')
        elif delegate_state in (AgentState.FINISHED, AgentState.REJECTED):
            logger.info(
                f'[Agent Controller {self.id}] Delegate agent has finished execution'
@@ -517,9 +586,7 @@ class AgentController:
            content = (
                f'{self.delegate.agent.name} finishes task with {formatted_output}'
            )
-            obs: Observation = AgentDelegateObservation(
-                outputs=outputs, content=content
-            )
+            obs = AgentDelegateObservation(outputs=outputs, content=content)

            # clean up delegate status
            self.delegate = None
@@ -583,8 +650,10 @@ class AgentController:
            max_iterations: The maximum number of iterations allowed for the task.
            confirmation_mode: Whether to enable confirmation mode.
        """
-        # state from the previous session, state from a parent agent, or a new state
-        # note that this is called twice when restoring a previous session, first with state=None
+        # state can come from:
+        # - the previous session, in which case it has history
+        # - from a parent agent, in which case it has no history
+        # - None / a new state
        if state is None:
            self.state = State(
                inputs={},
@@ -594,25 +663,103 @@ class AgentController:
        else:
            self.state = state

-        # when restored from a previous session, the State object will have history, start_id, and end_id
-        # connect it to the event stream
-        self.state.history.set_event_stream(self.event_stream)
+            if self.state.start_id <= -1:
+                self.state.start_id = 0

-        # if start_id was not set in State, we're starting fresh, at the top of the stream
-        start_id = self.state.start_id
-        if start_id == -1:
-            start_id = self.event_stream.get_latest_event_id() + 1
+            logger.debug(
+                f'AgentController {self.id} initializing history from event {self.state.start_id}'
+            )
+
+            self._init_history()
+
+    def _init_history(self):
+        """Initializes the agent's history from the event stream.
+
+        The history is a list of events that:
+        - Excludes events of types listed in self.filter_out
+        - Excludes events with hidden=True attribute
+        - For delegate events (between AgentDelegateAction and AgentDelegateObservation):
+            - Excludes all events between the action and observation
+            - Includes the delegate action and observation themselves
+        """
+
+        # define range of events to fetch
+        # delegates start with a start_id and initially won't find any events
+        # otherwise we're restoring a previous session
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id >= 0
+            else self.event_stream.get_latest_event_id()
+        )
+
+        # sanity check
+        if start_id > end_id + 1:
+            logger.debug(
+                f'start_id {start_id} is greater than end_id + 1 ({end_id + 1}). History will be empty.'
+            )
+            self.state.history = []
+            return
+
+        # Get all events, filtering out backend events and hidden events
+        events = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
+            )
+        )
+
+        # Find all delegate action/observation pairs
+        delegate_ranges: list[tuple[int, int]] = []
+        delegate_action_ids: list[int] = []  # stack of unmatched delegate action IDs
+
+        for event in events:
+            if isinstance(event, AgentDelegateAction):
+                delegate_action_ids.append(event.id)
+                # Note: we can get agent=event.agent and task=event.inputs.get('task','')
+                # if we need to track these in the future
+
+            elif isinstance(event, AgentDelegateObservation):
+                # Match with most recent unmatched delegate action
+                if not delegate_action_ids:
+                    logger.error(
+                        f'Found AgentDelegateObservation without matching action at id={event.id}'
+                    )
+                    continue
+
+                action_id = delegate_action_ids.pop()
+                delegate_ranges.append((action_id, event.id))
+
+        # Filter out events between delegate action/observation pairs
+        if delegate_ranges:
+            filtered_events: list[Event] = []
+            current_idx = 0
+
+            for start_id, end_id in sorted(delegate_ranges):
+                # Add events before delegate range
+                filtered_events.extend(
+                    event for event in events[current_idx:] if event.id < start_id
+                )
+
+                # Add delegate action and observation
+                filtered_events.extend(
+                    event for event in events if event.id in (start_id, end_id)
+                )
+
+                # Update index to after delegate range
+                current_idx = next(
+                    (i for i, e in enumerate(events) if e.id > end_id), len(events)
+                )
+
+            # Add any remaining events after last delegate range
+            filtered_events.extend(events[current_idx:])
+
+            self.state.history = filtered_events
        else:
-            logger.debug(f'AgentController {self.id} restoring from event {start_id}')
-
-        # make sure history is in sync
-        self.state.start_id = start_id
-        self.state.history.start_id = start_id
-
-        # if there was an end_id saved in State, set it in history
-        # currently not used, later useful for delegates
-        if self.state.end_id > -1:
-            self.state.history.end_id = self.state.end_id
+            self.state.history = events

    def _is_stuck(self):
        """Checks if the agent or its delegate is stuck in a loop.
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -10,9 +10,10 @@ from openhands.core.schema import AgentState
 from openhands.events.action import (
    MessageAction,
 )
-from openhands.events.action.agent import AgentFinishAction
+from openhands.events.action.agent import AgentFinishAction, AgentSummarizeAction
+from openhands.events.event import Event, EventSource
+from openhands.events.observation import AgentDelegateObservation
 from openhands.llm.metrics import Metrics
-from openhands.memory.history import ShortTermHistory
 from openhands.storage.files import FileStore


@@ -77,7 +78,7 @@ class State:
    # max number of iterations for the current task
    max_iterations: int = 100
    confirmation_mode: bool = False
-    history: ShortTermHistory = field(default_factory=ShortTermHistory)
+    history: list[Event] = field(default_factory=list)
    inputs: dict = field(default_factory=dict)
    outputs: dict = field(default_factory=dict)
    last_error: str | None = None
@@ -94,6 +95,8 @@ class State:
    start_id: int = -1
    end_id: int = -1
    almost_stuck: int = 0
+    delegates: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
+    summary: AgentSummarizeAction | None = None
    # NOTE: This will never be used by the controller, but it can be used by different
    # evaluation tasks to store extra data needed to track the progress/state of the task.
    extra_data: dict[str, Any] = field(default_factory=dict)
@@ -132,41 +135,46 @@ class State:
        return state

    def __getstate__(self):
+        # don't pickle history, it will be restored from the event stream
        state = self.__dict__.copy()
-
-        # save the relevant data from recent history
-        # so that we can restore it when the state is restored
-        if 'history' in state:
-            state['start_id'] = state['history'].start_id
-            state['end_id'] = state['history'].end_id
-
-        # don't save history object itself
-        state.pop('history', None)
+        state['history'] = []
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

-        # recreate the history object
+        # make sure we always have the attribute history
        if not hasattr(self, 'history'):
-            self.history = ShortTermHistory()
+            self.history = []

-        # restore the relevant data in history from the state
-        self.history.start_id = self.start_id
-        self.history.end_id = self.end_id
-
-        # remove the restored data from the state if any
-
-    def get_current_user_intent(self):
+    def get_current_user_intent(self) -> tuple[str | None, list[str] | None]:
        """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
        last_user_message = None
        last_user_message_image_urls: list[str] | None = []
-        for event in self.history.get_events(reverse=True):
+        for event in reversed(self.history):
            if isinstance(event, MessageAction) and event.source == 'user':
                last_user_message = event.content
                last_user_message_image_urls = event.images_urls
            elif isinstance(event, AgentFinishAction):
                if last_user_message is not None:
-                    return last_user_message
+                    return last_user_message, None

        return last_user_message, last_user_message_image_urls
+
+    def has_delegation(self) -> bool:
+        for event in self.history:
+            if isinstance(event, AgentDelegateObservation):
+                return True
+        return False
+
+    def get_last_agent_message(self) -> str | None:
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == EventSource.AGENT:
+                return event.content
+        return None
+
+    def get_last_user_message(self) -> str | None:
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == EventSource.USER:
+                return event.content
+        return None
--- a/openhands/controller/stuck.py
+++ b/openhands/controller/stuck.py
@@ -28,7 +28,7 @@ class StuckDetector:
        # filter out MessageAction with source='user' from history
        filtered_history = [
            event
-            for event in self.state.history.get_events()
+            for event in self.state.history
            if not (
                (isinstance(event, MessageAction) and event.source == EventSource.USER)
                or
--- a/openhands/core/config/app_config.py
+++ b/openhands/core/config/app_config.py
@@ -40,7 +40,6 @@ class AppConfig:
        e2b_api_key: The E2B API key.
        disable_color: Whether to disable color. For terminals that don't support color.
        debug: Whether to enable debugging.
-        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
        file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
        file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
        file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
@@ -72,7 +71,6 @@ class AppConfig:
    disable_color: bool = False
    jwt_secret: str = uuid.uuid4().hex
    debug: bool = False
-    enable_cli_session: bool = False
    file_uploads_max_file_size_mb: int = 0
    file_uploads_restrict_file_types: bool = False
    file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -42,6 +42,10 @@ class LLMConfig:
        log_completions: Whether to log LLM completions to the state.
        log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
        draft_editor: A more efficient LLM to use for file editing. Introduced in [PR 3985](https://github.com/All-Hands-AI/OpenHands/pull/3985).
+        max_conversation_window: The maximum number of messages to include in the conversation window (context), after which old messages are truncated or summarized.
+        conversation_top_k: The number of top results to retrieve from the conversation history.
+        message_summary_warning_level: The fraction of the conversation window for warning about context overflow (e.g. 0.75 for 75% of the tokens).
+        custom_tokenizer: tokenizer to use for computing token size. Not necessary for Open AI, Anthropic. LiteLLM will check HuggingFace for this (e.g. 'deepseek-ai/deepseek-V2.5')
    """

    model: str = 'gpt-4o'
@@ -76,6 +80,10 @@ class LLMConfig:
    log_completions: bool = False
    log_completions_folder: str | None = None
    draft_editor: Optional['LLMConfig'] = None
+    max_conversation_window: int = 10
+    conversation_top_k: int = 5
+    message_summary_warning_level: float = 0.75
+    custom_tokenizer: str | None = None

    def defaults_to_dict(self) -> dict:
        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -136,15 +136,30 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
                    logger.openhands_logger.debug(
                        'Attempt to load default LLM config from config toml'
                    )
-                    llm_config = LLMConfig.from_dict(value)
-                    cfg.set_llm_config(llm_config, 'llm')
+                    # Extract generic LLM fields
+                    generic_llm_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    generic_llm_config = LLMConfig.from_dict(generic_llm_fields)
+                    cfg.set_llm_config(generic_llm_config, 'llm')
+
+                    # Process custom named LLM configs
                    for nested_key, nested_value in value.items():
                        if isinstance(nested_value, dict):
                            logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as llm config'
+                                f'Attempt to load group {nested_key} from config toml as LLM config'
                            )
-                            llm_config = LLMConfig.from_dict(nested_value)
-                            cfg.set_llm_config(llm_config, nested_key)
+                            # Apply generic LLM config with custom LLM overrides, e.g.
+                            # [llm]
+                            # model="..."
+                            # num_retries = 5
+                            # [llm.claude]
+                            # model="claude-3-5-sonnet"
+                            # results in num_retries APPLIED to claude-3-5-sonnet
+                            merged_llm_dict = generic_llm_config.__dict__.copy()
+                            merged_llm_dict.update(nested_value)
+                            custom_llm_config = LLMConfig.from_dict(merged_llm_dict)
+                            cfg.set_llm_config(custom_llm_config, nested_key)
                elif not key.startswith('sandbox') and key.lower() != 'core':
                    logger.openhands_logger.warning(
                        f'Unknown key in {toml_file}: "{key}"'
--- a/openhands/core/exceptions.py
+++ b/openhands/core/exceptions.py
@@ -94,3 +94,20 @@ class CloudFlareBlockageError(Exception):
    """Exception raised when a request is blocked by CloudFlare."""

    pass
+
+
+class SummarizeError(Exception):
+    """Exception raised when messages can't be summarized (e.g. the condenser finds at most one condensable message)."""
+
+    def __init__(self, message='Error summarizing the memory'):
+        super().__init__(message)
+
+
+class InvalidSummaryResponseError(Exception):  # presumably raised when a summary reply can't be parsed - confirm at the raise site
+    def __init__(self, message='Invalid summary response'):
+        super().__init__(message)
+
+
+class TokenLimitExceededError(Exception):  # raised by the LLM completion wrapper when the prompt exceeds max_input_tokens
+    def __init__(self, message='Token limit exceeded'):
+        super().__init__(message)
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -125,16 +125,18 @@ async def run_controller(
        await runtime.connect()

    event_stream = runtime.event_stream
-    # restore cli session if enabled
+
+    # restore cli session if available
    initial_state = None
-    if config.enable_cli_session:
-        try:
-            logger.info(f'Restoring agent state from cli session {event_stream.sid}')
-            initial_state = State.restore_from_session(
-                event_stream.sid, event_stream.file_store
-            )
-        except Exception as e:
-            logger.info(f'Error restoring state: {e}')
+    try:
+        logger.debug(
+            f'Trying to restore agent state from cli session {event_stream.sid} if available'
+        )
+        initial_state = State.restore_from_session(
+            event_stream.sid, event_stream.file_store
+        )
+    except Exception as e:
+        logger.debug(f'Cannot restore agent state: {e}')

    # init controller with this initial state
    controller = AgentController(
@@ -160,7 +162,7 @@ async def run_controller(
    )

    # start event is a MessageAction with the task, either resumed or new
-    if config.enable_cli_session and initial_state is not None:
+    if initial_state is not None:
        # we're resuming the previous session
        event_stream.add_event(
            MessageAction(
@@ -171,7 +173,7 @@ async def run_controller(
            ),
            EventSource.USER,
        )
-    elif initial_state is None:
+    else:
        # init with the provided actions
        event_stream.add_event(initial_user_action, EventSource.USER)

@@ -198,8 +200,9 @@ async def run_controller(
        await asyncio.sleep(1)  # Give back control for a tick, so the agent can run

    # save session when we're about to close
-    if config.enable_cli_session:
+    if config.file_store is not None and config.file_store != 'memory':
        end_state = controller.get_state()
+        # NOTE: the saved state does not include delegates events
        end_state.save_to_session(event_stream.sid, event_stream.file_store)

    # close when done
@@ -210,10 +213,7 @@ async def run_controller(
    if config.trajectories_path is not None:
        file_path = os.path.join(config.trajectories_path, sid + '.json')
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
-        histories = [
-            event_to_trajectory(event)
-            for event in state.history.get_events(include_delegates=True)
-        ]
+        histories = [event_to_trajectory(event) for event in state.history]
        with open(file_path, 'w') as f:
            json.dump(histories, f)

--- a/openhands/core/message.py
+++ b/openhands/core/message.py
@@ -52,6 +52,8 @@ class Message(BaseModel):
    content: list[TextContent | ImageContent] = Field(default=list)
    cache_enabled: bool = False
    vision_enabled: bool = False
+    condensable: bool = True
+    event_id: int = -1

    @property
    def contains_image(self) -> bool:
--- a/openhands/core/schema/action.py
+++ b/openhands/core/schema/action.py
@@ -86,5 +86,8 @@ class ActionTypeSchema(BaseModel):
    SEND_PR: str = Field(default='send_pr')
    """Send a PR to github."""

+    RECALL: str = Field(default='recall')
+    """Recalls the memory."""
+

 ActionType = ActionTypeSchema()
--- a/openhands/core/schema/observation.py
+++ b/openhands/core/schema/observation.py
@@ -44,5 +44,7 @@ class ObservationTypeSchema(BaseModel):

    USER_REJECTED: str = Field(default='user_rejected')

+    RECALL: str = Field(default='recall')
+

 ObservationType = ObservationTypeSchema()
--- a/openhands/events/action/agent.py
+++ b/openhands/events/action/agent.py
@@ -20,8 +20,12 @@ class ChangeAgentStateAction(Action):

@dataclass
 class AgentSummarizeAction(Action):
-    summary: str
+    summary: str = ''  # summary to be inserted as a memory block
    action: str = ActionType.SUMMARIZE
+    start_id: int = -1
+    end_id: int = -1
+    summarized_actions: str = ''
+    summarized_observations: str = ''

    @property
    def message(self) -> str:
@@ -78,3 +82,14 @@ class AgentDelegateAction(Action):
    @property
    def message(self) -> str:
        return f"I'm asking {self.agent} for help with this task."
+
+
+@dataclass
+class AgentRecallAction(Action):  # action carrying a query to recall from memory
+    query: str  # the text to search the memory for
+    thought: str = ''
+    action: str = ActionType.RECALL
+
+    @property
+    def message(self) -> str:
+        return f'Recalling "{self.query[:10]}..."'  # shows only the first 10 characters of the query
--- a/openhands/events/action/message.py
+++ b/openhands/events/action/message.py
@@ -7,7 +7,7 @@ from openhands.events.action.action import Action, ActionSecurityRisk
@dataclass
 class MessageAction(Action):
    content: str
-    images_urls: list | None = None
+    images_urls: list[str] | None = None
    wait_for_response: bool = False
    action: str = ActionType.MESSAGE
    security_risk: ActionSecurityRisk | None = None
--- a/openhands/events/observation/agent.py
+++ b/openhands/events/observation/agent.py
@@ -14,3 +14,14 @@ class AgentStateChangedObservation(Observation):
    @property
    def message(self) -> str:
        return ''
+
+
+@dataclass
+class AgentRecallObservation(Observation):  # observation carrying recalled memory for a query
+    query: str  # the query this observation answers
+    memory: str  # the recalled memory text, rendered by `message`
+    observation: str = ObservationType.RECALL
+
+    @property
+    def message(self) -> str:
+        return f'Memory:\n{self.memory}'  # the query itself is not part of the message
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@@ -3,7 +3,9 @@ from openhands.events.action.action import Action
 from openhands.events.action.agent import (
    AgentDelegateAction,
    AgentFinishAction,
+    AgentRecallAction,
    AgentRejectAction,
+    AgentSummarizeAction,
    ChangeAgentStateAction,
 )
 from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
@@ -36,6 +38,8 @@ actions = (
    ModifyTaskAction,
    ChangeAgentStateAction,
    MessageAction,
+    AgentSummarizeAction,
+    AgentRecallAction,
 )

 ACTION_TYPE_TO_CLASS = {action_class.action: action_class for action_class in actions}  # type: ignore[attr-defined]
--- a/openhands/events/stream.py
+++ b/openhands/events/stream.py
@@ -20,6 +20,7 @@ class EventStreamSubscriber(str, Enum):
    RUNTIME = 'runtime'
    MAIN = 'main'
    TEST = 'test'
+    MEMORY = 'memory'


 def session_exists(sid: str, file_store: FileStore) -> bool:
@@ -67,11 +68,26 @@ class EventStream:

    def get_events(
        self,
-        start_id=0,
-        end_id=None,
-        reverse=False,
+        start_id: int = 0,
+        end_id: int | None = None,
+        reverse: bool = False,
        filter_out_type: tuple[type[Event], ...] | None = None,
+        filter_hidden: bool = False,
    ) -> Iterable[Event]:
+        """
+        Retrieve events from the event stream, optionally filtering out events of a given type
+        and events marked as hidden.
+
+        Args:
+            start_id: The ID of the first event to retrieve. Defaults to 0.
+            end_id: The ID of the last event to retrieve. Defaults to the last event in the stream.
+            reverse: Whether to retrieve events in reverse order. Defaults to False.
+            filter_out_type: A tuple of event types to filter out. Typically used to filter out backend events from the agent.
+            filter_hidden: If True, filters out events with the 'hidden' attribute set to True.
+
+        Yields:
+            Events from the stream that match the criteria.
+        """
        if reverse:
            if end_id is None:
                end_id = self._cur_id - 1
@@ -79,9 +95,11 @@ class EventStream:
            while event_id >= start_id:
                try:
                    event = self.get_event(event_id)
-                    if filter_out_type is None or not isinstance(
-                        event, filter_out_type
-                    ):
+                    # apply type and 'hidden' filters
+                    if (
+                        filter_out_type is None
+                        or not isinstance(event, filter_out_type)
+                    ) and (not filter_hidden or not getattr(event, 'hidden', False)):
                        yield event
                except FileNotFoundError:
                    logger.debug(f'No event found for ID {event_id}')
@@ -93,9 +111,11 @@ class EventStream:
                    break
                try:
                    event = self.get_event(event_id)
-                    if filter_out_type is None or not isinstance(
-                        event, filter_out_type
-                    ):
+                    # apply type and 'hidden' filters
+                    if (
+                        filter_out_type is None
+                        or not isinstance(event, filter_out_type)
+                    ) and (not filter_hidden or not getattr(event, 'hidden', False)):
                        yield event
                except FileNotFoundError:
                    break
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -7,13 +7,18 @@ from functools import partial
 from typing import Any

 from openhands.core.config import LLMConfig
+from openhands.core.exceptions import TokenLimitExceededError
+from openhands.events.event import Event
+from openhands.events.serialization.event import event_to_memory

 with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    import litellm
+import numpy as np
 from litellm import ModelInfo, PromptTokensDetails
 from litellm import completion as litellm_completion
 from litellm import completion_cost as litellm_completion_cost
+from litellm import embedding as litellm_embedding
 from litellm.exceptions import (
    APIConnectionError,
    APIError,
@@ -22,6 +27,7 @@ from litellm.exceptions import (
    ServiceUnavailableError,
 )
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
+from litellm.utils import create_pretrained_tokenizer

 from openhands.core.exceptions import CloudFlareBlockageError
 from openhands.core.logger import openhands_logger as logger
@@ -126,6 +132,13 @@ class LLM(RetryMixin, DebugMixin):
                ):
                    self.config.max_output_tokens = self.model_info['max_tokens']

+        # if using a custom tokenizer, make sure it's loaded and accessible in the format expected by litellm
+        if self.config.custom_tokenizer is not None:
+            self.tokenizer = create_pretrained_tokenizer(self.config.custom_tokenizer)
+        else:
+            self.tokenizer = None
+
+        # set up the completion function
        self._completion = partial(
            litellm_completion,
            model=self.config.model,
@@ -185,6 +198,14 @@ class LLM(RetryMixin, DebugMixin):
            # log the entire LLM prompt
            self.log_prompt(messages)

+            # find out if we have too many tokens
+            token_count = self.get_token_count(messages)
+            max_input_tokens = self.config.max_input_tokens
+            if token_count > max_input_tokens:
+                raise TokenLimitExceededError(
+                    f'Token limit exceeded: {token_count} > {max_input_tokens}'
+                )
+
            if self.is_caching_prompt_active():
                # Anthropic-specific prompt caching
                if 'claude-3' in self.config.model:
@@ -339,15 +360,32 @@ class LLM(RetryMixin, DebugMixin):
        """Get the number of tokens in a list of messages.

        Args:
-            messages (list): A list of messages.
+            messages (list): A list of messages, either as a list of dicts or as a list of Message objects.

        Returns:
            int: The number of tokens.
        """
+        # convert Message objects to dicts, litellm expects dicts
+        if (
+            isinstance(messages, list)
+            and len(messages) > 0
+            and isinstance(messages[0], Message)
+        ):
+            messages = self.format_messages_for_llm(messages)
+
+        # try to get the token count with the default litellm tokenizers
+        # or the custom tokenizer attribute if set for this LLM configuration
        try:
-            return litellm.token_counter(model=self.config.model, messages=messages)
-        except Exception:
+            return litellm.token_counter(
+                model=self.config.model,
+                messages=messages,
+                custom_tokenizer=self.tokenizer,
+            )
+        except Exception as e:
            # TODO: this is to limit logspam in case token count is not supported
+            logger.error(
+                f'Error getting token count for\n model {self.config.model}\ncustom_tokenizer: {self.config.custom_tokenizer}\n{e}'
+            )
            return 0

    def _is_local(self):
@@ -426,3 +464,87 @@ class LLM(RetryMixin, DebugMixin):

        # let pydantic handle the serialization
        return [message.model_dump() for message in messages]
+
+    def embed_event(self, event: Event) -> np.ndarray:
+        """
+        Turn one event into its embedding vector.
+
+        The event is converted with event_to_memory and sent to the configured
+        embedding model through litellm.
+
+        Args:
+            event (Event): The event to embed.
+
+        Returns:
+            np.ndarray: The embedding vector of the event.
+        """
+        cfg = self.config
+        response = litellm_embedding(
+            model=cfg.embedding_model,
+            input=event_to_memory(event, -1),
+            custom_llm_provider=cfg.custom_llm_provider,
+            api_key=cfg.api_key,
+            base_url=cfg.base_url,
+            api_version=cfg.api_version,
+            input_cost_per_token=cfg.input_cost_per_token,
+            output_cost_per_token=cfg.output_cost_per_token,
+        )
+        return np.array(response['data'][0]['embedding'])
+
+    def embed_history(self, history: list[Event]) -> list[np.ndarray]:
+        """
+        Compute one embedding vector per event, preserving the input order.
+
+        Each event is embedded individually through embed_event.
+
+        Args:
+            history (list[Event]): The events to embed.
+
+        Returns:
+            list[np.ndarray]: The embedding vectors, aligned index-for-index
+                with ``history``.
+        """
+        # one embed_event call per event
+        return [self.embed_event(item) for item in history]
+
+    def search(self, query: str, history: list[Event], top_k: int = 5) -> list[Event]:
+        """
+        Recalls the most similar events based on the query.
+
+        Args:
+            query (str): The query string.
+            history (list[Event]): The events to search; all are embedded on each call.
+            top_k (int, optional): The number of top similar events to retrieve. Defaults to 5.
+
+        Returns:
+            list[Event]: The recalled events, most similar first (at most top_k).
+        """
+        # nothing to rank (also avoids slicing with [-0:], which keeps everything)
+        if not history or top_k <= 0:
+            return []
+
+        # embed every event; shape (n_events, dim)
+        embeddings = np.asarray(self.embed_history(history))
+
+        # Embed the query
+        query_embedding_response = litellm_embedding(
+            model=self.config.embedding_model,
+            input=query,
+            custom_llm_provider=self.config.custom_llm_provider,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            input_cost_per_token=self.config.input_cost_per_token,
+            output_cost_per_token=self.config.output_cost_per_token,
+        )
+        query_embedding = np.array(query_embedding_response['data'][0]['embedding'])
+
+        # Dot-product similarity of each event with the query; shape (n_events,)
+        # (equals cosine similarity only if the model returns unit-length vectors)
+        similarity_scores = embeddings @ query_embedding
+
+        # Get the top_k indices
+        top_indices = similarity_scores.argsort()[-top_k:][::-1]
+
+        # Retrieve the corresponding events
+        return [history[i] for i in top_indices]
--- a/openhands/memory/init.py
+++ b/openhands/memory/init.py
@@ -1,5 +1,3 @@
-from openhands.memory.condenser import MemoryCondenser
-from openhands.memory.history import ShortTermHistory
 from openhands.memory.memory import LongTermMemory
-
-__all__ = ['LongTermMemory', 'ShortTermHistory', 'MemoryCondenser']
+from openhands.memory.utils import parse_summary_response
+__all__ = ['LongTermMemory', 'parse_summary_response']
--- a/openhands/memory/base_memory.py
+++ b/openhands/memory/base_memory.py
@@ -0,0 +1,26 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class Memory(ABC):
+    """Abstract base class for all memory modules.
+
+    Concrete memories must implement to_dict, __str__ and reset.
+    """
+
+    @abstractmethod
+    def to_dict(self) -> dict[str, Any]:
+        """Convert the memory module to a dictionary of individual memories."""
+
+    # @abstractmethod
+    # def from_dict(self, data: dict[str, Any]) -> None:
+    #    """Load the memory module from a dictionary of individual memories."""
+    #    pass
+
+    @abstractmethod
+    def __str__(self) -> str:
+        """String representation of the memory module."""
+
+    @abstractmethod
+    def reset(self) -> None:
+        """Reset the memory module."""
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,24 +1,134 @@
+from litellm.types.utils import ModelResponse
+
+from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import Message, TextContent
+from openhands.events.action import AgentSummarizeAction
 from openhands.llm.llm import LLM
+from openhands.memory.utils import parse_summary_response
+from openhands.utils.prompt import PromptManager


 class MemoryCondenser:
-    def condense(self, summarize_prompt: str, llm: LLM):
-        """Attempts to condense the memory by using the llm
+    def __init__(self, llm: LLM, prompt_manager: PromptManager):
+        self.llm = llm  # used for token counting and for the summarization calls
+        self.prompt_manager = prompt_manager  # supplies the summarize prompt

-        Parameters:
-        - llm (LLM): llm to be used for summarization
+        # shorthand for the LLM's configured max input tokens
+        self.context_window = llm.config.max_input_tokens

-        Raises:
-        - Exception: the same exception as it got from the llm or processing the response
+    def condense(
+        self,
+        messages: list[Message],
+    ) -> AgentSummarizeAction:
        """
-        try:
-            messages = [{'content': summarize_prompt, 'role': 'user'}]
-            resp = llm.completion(messages=messages)
-            summary_response = resp['choices'][0]['message']['content']
-            return summary_response
-        except Exception as e:
-            logger.error('Error condensing thoughts: %s', str(e), exc_info=False)
+        Condenses a list of messages using the LLM and returns a summary action (raises SummarizeError if at most one message is condensable).

-            # TODO If the llm fails with ContextWindowExceededError, we can try to condense the memory chunk by chunk
-            raise
+        Args:
+            messages (list[Message]): The messages to condense; only those with condensable=True are summarized.
+
+        Returns:
+            AgentSummarizeAction: The summary with start_id/end_id set, or an empty action with end_id=-1 when under the token limit.
+        """
+        assert (
+            self.context_window is not None and self.context_window > 2000
+        ), 'context window must be a number over 2000'
+
+        # don't condense if under the token limit
+        total_token_count = self.llm.get_token_count(messages)
+        if total_token_count < self.context_window:
+            logger.debug(
+                f'Not condensing messages because token count ({total_token_count}) is less than max input tokens ({self.context_window})'
+            )
+            return AgentSummarizeAction(end_id=-1)
+
+        # calculate safe token limit for processing: the message_summary_warning_level fraction of the context window
+        safe_token_limit = int(
+            self.context_window * self.llm.config.message_summary_warning_level
+        )
+
+        # collect condensable messages paired with their individual token counts
+        condensable_messages: list[tuple[Message, int]] = [
+            (msg, self.llm.get_token_count([msg.model_dump()]))
+            for msg in messages
+            if msg.condensable
+        ]
+
+        if len(condensable_messages) <= 1:
+            # prevents potential infinite loop of summarizing the same message repeatedly
+            raise SummarizeError(
+                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(condensable_messages)} <= 1]"
+            )
+
+        # track the very first message's id - this will be our start_id
+        first_message_id = condensable_messages[0][0].event_id
+
+        # create chunks that fit within safe_token_limit
+        chunks: list[list[Message]] = []
+        current_chunk: list[Message] = []
+        current_chunk_tokens = 0
+
+        for msg, token_count in condensable_messages:
+            if current_chunk_tokens + token_count > safe_token_limit:
+                if current_chunk:  # save current chunk if not empty, it's done
+                    chunks.append(current_chunk)
+
+                # start a new chunk (a single oversized message still gets a chunk of its own)
+                current_chunk = [msg]
+                current_chunk_tokens = token_count
+            else:
+                # add to current chunk
+                current_chunk.append(msg)
+                current_chunk_tokens += token_count
+
+        # add the last chunk
+        if current_chunk:
+            chunks.append(current_chunk)
+
+        # process chunks in order, carrying the running summary forward
+        final_summary = None
+        # track the last real message id (note: not summary actions)
+        last_real_message_id = condensable_messages[-1][0].event_id
+
+        for i, chunk in enumerate(chunks):
+            if final_summary:
+                # prepend previous summary to next chunk
+                summary_message = Message(
+                    role='user',
+                    content=[TextContent(text=f'Previous summary:\n{final_summary}')],
+                    condensable=True,
+                    # Note: summary messages don't have an event_id
+                    event_id=-1,
+                )
+                chunk.insert(0, summary_message)
+
+            action_response = self._summarize_messages(chunk)
+            summary_action = parse_summary_response(action_response)
+            final_summary = summary_action.summary
+
+        # create final summary action
+        assert final_summary is not None, 'final summary must not be None here'
+        return AgentSummarizeAction(
+            summary=final_summary,
+            start_id=first_message_id,
+            end_id=last_real_message_id,
+        )
+
+    def _summarize_messages(self, message_sequence_to_summarize: list[Message]) -> str:
+        """Ask the LLM for a summary of the given messages and return the raw reply text."""
+        # hand the conversation to the prompt manager before reading its summarize prompt
+        self.prompt_manager.conversation_history = self.llm.format_messages_for_llm(
+            message_sequence_to_summarize
+        )
+        summarize_prompt = self.prompt_manager.summarize_message
+        message = Message(role='system', content=[TextContent(text=summarize_prompt)])
+        serialized_message = message.model_dump()
+        response = self.llm.completion(
+            messages=[serialized_message],
+            temperature=0.2,
+        )
+
+        # use the project logger instead of print, so the output respects log levels
+        logger.debug(f'summarize_messages got response: {response}')
+        assert isinstance(response, ModelResponse), 'response must be a ModelResponse'
+        return response.choices[0].message.content
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -0,0 +1,115 @@
+from enum import Enum
+
+from openhands.controller.state.state import State
+from openhands.core.config.llm_config import LLMConfig
+from openhands.events.event import Event
+from openhands.events.serialization.event import event_to_dict
+from openhands.llm.llm import LLM
+from openhands.memory.base_memory import Memory
+
+
+class StorageType(Enum):
+    IN_MEMORY = 'in-memory'  # ConversationMemory.search delegates to llm.search over the in-memory event list
+    VECTOR = 'vector'  # not handled by ConversationMemory.search, which raises ValueError for it
+
+
+class ConversationMemory(Memory):
+    """Allows the agent to recall events from its entire history, with support for summarization and recall.
+
+    This class handles the summarized events (from state.summary.start_id to state.summary.end_id)
+    by replacing them with the summary event, keeping the events before and after that range.
+    """
+
+    memory: list[Event]
+    memory_config: LLMConfig
+
+    def __init__(
+        self,
+        memory_config: LLMConfig,
+        state: State,
+    ) -> None:
+        """
+        Initialize ConversationMemory from the given runtime state.
+
+        Args:
+        - memory_config: The LLM configuration kept as self.memory_config.
+        - state: The runtime state to read the history (and summary, if any) from.
+          It is passed to update() at the end of initialization.
+        """
+        self.memory = []
+        self.memory_config = memory_config
+        # total messages in the conversation
+        # won't this always be the same as len(history)?
+        # core memory isn't counted here
+        self.total_message_count = 0
+        # of which hidden
+        self.hidden_message_count = 0
+
+        # init storage type
+        self.storage_type = StorageType.IN_MEMORY
+
+        # read itself from the runtime state
+        self.update(state)
+
+    def update(self, state: State) -> None:
+        """Updates the conversation memory from a new runtime state (an absent state yields an empty memory)."""
+        # memory is not simply state.history: when a summary exists, the events from
+        # summary.start_id to summary.end_id are replaced with a single summary event
+        # NOTE(review): start_id/end_id are used as list indices below - confirm they line up with history positions
+        if state and state.summary:
+            self.memory = (
+                state.history[: state.summary.start_id]
+                + [state.summary]
+                + state.history[state.summary.end_id :]
+            )
+            self.hidden_message_count = state.summary.end_id - state.summary.start_id
+        else:
+            self.memory = state.history if state else []  # the list is shared with state, not copied
+            self.hidden_message_count = 0
+
+    def reset(self) -> None:
+        """Resets the conversation memory."""
+        self.memory = []
+        self.total_message_count = 0
+        self.hidden_message_count = 0
+
+    def update_summary(self, summary: str, hidden_count: int) -> None:
+        """Records how many messages a summary hides; the summary text itself is not stored here."""
+        self.hidden_message_count = hidden_count
+
+    def to_dict(self) -> dict:
+        # return a dict with key = event.id, value = event.to_dict()
+        return {event.id: event_to_dict(event) for event in self.memory}
+
+    def __str__(self) -> str:
+        return f'ConversationMemory with {len(self.memory)} total events'
+
+    def search(self, llm: LLM, query: str, top_k: int = 5) -> list:
+        """Searches the conversation memory (self.memory) for relevant messages; empty memory or query gives []."""
+        if not self.memory or not query:
+            return []
+
+        if self.storage_type == StorageType.IN_MEMORY:
+            # use the llm.py search to find relevant messages
+            recalled_events = llm.search(query=query, history=self.memory, top_k=top_k)
+        else:
+            raise ValueError(f'Unsupported storage type: {self.storage_type}')
+
+        return recalled_events
+
+    def recall_memory(
+        self, llm: LLM, state: State, query: str, top_k: int = 5
+    ) -> list[Event]:
+        """
+        Get the most similar events based on the query (for testing recall with litellm; searches state.history).
+
+        Args:
+            llm: The LLM whose search() ranks the events.
+            state: The runtime state whose full history is searched.
+            query: The query string for semantic search.
+            top_k: Number of top results to retrieve.
+
+        Returns:
+            A list of semantically similar events.
+        """
+        return llm.search(query, state.history, top_k)
--- a/openhands/memory/core_memory.py
+++ b/openhands/memory/core_memory.py
@@ -0,0 +1,60 @@
+from openhands.memory.base_memory import Memory
+
+
+class CoreMemory(Memory):
+    """Memory contents to be inserted in the prompt. This includes key facts and context
+    that the LLM needs to maintain about its current tasks and capabilities."""
+
+    def __init__(self, limit: int = 1500):
+        super().__init__()
+        self.char_limit = limit
+        self.blocks = {
+            'personality': [],  # agent's personality traits and capabilities
+            'task_context': [],  # important context about current tasks
+        }
+
+    def add_block(self, category: str, content: str) -> bool:
+        """Add a memory block to a specific category.
+        Returns True if successful, False if it would exceed the limit."""
+        if category not in self.blocks:
+            raise ValueError(
+                f'Invalid category: {category}. Must be one of {list(self.blocks.keys())}'
+            )
+
+        # Calculate total size with new content
+        potential_content = self.format_blocks() + f'\n- {content}'
+        if len(potential_content) > self.char_limit:
+            return False
+
+        self.blocks[category].append(content)
+        return True
+
+    def get_blocks(
+        self, category: str | None = None
+    ) -> dict[str, list[str]] | list[str]:
+        """Get memory blocks, optionally filtered by category."""
+        if category:
+            return self.blocks.get(category, [])
+        return self.blocks
+
+    def format_blocks(self) -> str:
+        """Format memory blocks for inclusion in the system prompt."""
+        formatted = []
+
+        for category, items in self.blocks.items():
+            if items:
+                formatted.append(f"\n{category.replace('_', ' ').title()}:")
+                formatted.extend([f'- {item}' for item in items])
+
+        return '\n'.join(formatted)
+
+    def __str__(self) -> str:
+        return self.format_blocks()
+
+    def to_dict(self) -> dict:
+        return {category: items for category, items in self.blocks.items()}
+
+    def reset(self) -> None:
+        """Reset all memory blocks."""
+        for category in self.blocks:
+            self.blocks[category] = []
--- a/openhands/memory/history.py
+++ b/openhands/memory/history.py
@@ -1,224 +0,0 @@
-from typing import ClassVar, Iterable
-
-from openhands.core.logger import openhands_logger as logger
-from openhands.events.action.action import Action
-from openhands.events.action.agent import (
-    AgentDelegateAction,
-    ChangeAgentStateAction,
-)
-from openhands.events.action.empty import NullAction
-from openhands.events.action.message import MessageAction
-from openhands.events.event import Event, EventSource
-from openhands.events.observation.agent import AgentStateChangedObservation
-from openhands.events.observation.delegate import AgentDelegateObservation
-from openhands.events.observation.empty import NullObservation
-from openhands.events.observation.observation import Observation
-from openhands.events.serialization.event import event_to_dict
-from openhands.events.stream import EventStream
-from openhands.events.utils import get_pairs_from_events
-
-
-class ShortTermHistory(list[Event]):
-    """A list of events that represents the short-term memory of the agent.
-
-    This class provides methods to retrieve and filter the events in the history of the running agent from the event stream.
-    """
-
-    start_id: int
-    end_id: int
-    _event_stream: EventStream
-    delegates: dict[tuple[int, int], tuple[str, str]]
-    filter_out: ClassVar[tuple[type[Event], ...]] = (
-        NullAction,
-        NullObservation,
-        ChangeAgentStateAction,
-        AgentStateChangedObservation,
-    )
-
-    def __init__(self):
-        super().__init__()
-        self.start_id = -1
-        self.end_id = -1
-        self.delegates = {}
-
-    def set_event_stream(self, event_stream: EventStream):
-        self._event_stream = event_stream
-
-    def get_events_as_list(self, include_delegates: bool = False) -> list[Event]:
-        """Return the history as a list of Event objects."""
-        return list(self.get_events(include_delegates=include_delegates))
-
-    def get_events(
-        self,
-        reverse: bool = False,
-        include_delegates: bool = False,
-        include_hidden=False,
-    ) -> Iterable[Event]:
-        """Return the events as a stream of Event objects."""
-        # TODO handle AgentRejectAction, if it's not part of a chunk ending with an AgentDelegateObservation
-        # or even if it is, because currently we don't add it to the summary
-
-        # iterate from start_id to end_id, or reverse
-        start_id = self.start_id if self.start_id != -1 else 0
-        end_id = (
-            self.end_id
-            if self.end_id != -1
-            else self._event_stream.get_latest_event_id()
-        )
-
-        for event in self._event_stream.get_events(
-            start_id=start_id,
-            end_id=end_id,
-            reverse=reverse,
-            filter_out_type=self.filter_out,
-        ):
-            if not include_hidden and hasattr(event, 'hidden') and event.hidden:
-                continue
-            # TODO add summaries
-            # and filter out events that were included in a summary
-
-            # filter out the events from a delegate of the current agent
-            if not include_delegates and not any(
-                # except for the delegate action and observation themselves, currently
-                # AgentDelegateAction has id = delegate_start
-                # AgentDelegateObservation has id = delegate_end
-                delegate_start < event.id < delegate_end
-                for delegate_start, delegate_end in self.delegates.keys()
-            ):
-                yield event
-            elif include_delegates:
-                yield event
-
-    def get_last_action(self, end_id: int = -1) -> Action | None:
-        """Return the last action from the event stream, filtered to exclude unwanted events."""
-        # from end_id in reverse, find the first action
-        end_id = self._event_stream.get_latest_event_id() if end_id == -1 else end_id
-
-        last_action = next(
-            (
-                event
-                for event in self._event_stream.get_events(
-                    end_id=end_id, reverse=True, filter_out_type=self.filter_out
-                )
-                if isinstance(event, Action)
-            ),
-            None,
-        )
-
-        return last_action
-
-    def get_last_observation(self, end_id: int = -1) -> Observation | None:
-        """Return the last observation from the event stream, filtered to exclude unwanted events."""
-        # from end_id in reverse, find the first observation
-        end_id = self._event_stream.get_latest_event_id() if end_id == -1 else end_id
-
-        last_observation = next(
-            (
-                event
-                for event in self._event_stream.get_events(
-                    end_id=end_id, reverse=True, filter_out_type=self.filter_out
-                )
-                if isinstance(event, Observation)
-            ),
-            None,
-        )
-
-        return last_observation
-
-    def get_last_user_message(self) -> str:
-        """Return the content of the last user message from the event stream."""
-        last_user_message = next(
-            (
-                event.content
-                for event in self._event_stream.get_events(reverse=True)
-                if isinstance(event, MessageAction) and event.source == EventSource.USER
-            ),
-            None,
-        )
-
-        return last_user_message if last_user_message is not None else ''
-
-    def get_last_agent_message(self) -> str:
-        """Return the content of the last agent message from the event stream."""
-        last_agent_message = next(
-            (
-                event.content
-                for event in self._event_stream.get_events(reverse=True)
-                if isinstance(event, MessageAction)
-                and event.source == EventSource.AGENT
-            ),
-            None,
-        )
-
-        return last_agent_message if last_agent_message is not None else ''
-
-    def get_last_events(self, n: int) -> list[Event]:
-        """Return the last n events from the event stream."""
-        # dummy agent is using this
-        # it should work, but it's not great to store temporary lists now just for a test
-        end_id = self._event_stream.get_latest_event_id()
-        start_id = max(0, end_id - n + 1)
-
-        return list(
-            event
-            for event in self._event_stream.get_events(
-                start_id=start_id,
-                end_id=end_id,
-                filter_out_type=self.filter_out,
-            )
-        )
-
-    def has_delegation(self) -> bool:
-        for event in self._event_stream.get_events():
-            if isinstance(event, AgentDelegateObservation):
-                return True
-        return False
-
-    def on_event(self, event: Event):
-        if not isinstance(event, AgentDelegateObservation):
-            return
-
-        logger.debug('AgentDelegateObservation received')
-
-        # figure out what this delegate's actions were
-        # from the last AgentDelegateAction to this AgentDelegateObservation
-        # and save their ids as start and end ids
-        # in order to use later to exclude them from parent stream
-        # or summarize them
-        delegate_end = event.id
-        delegate_start = -1
-        delegate_agent: str = ''
-        delegate_task: str = ''
-        for prev_event in self._event_stream.get_events(
-            end_id=event.id - 1, reverse=True
-        ):
-            if isinstance(prev_event, AgentDelegateAction):
-                delegate_start = prev_event.id
-                delegate_agent = prev_event.agent
-                delegate_task = prev_event.inputs.get('task', '')
-                break
-
-        if delegate_start == -1:
-            logger.error(
-                f'No AgentDelegateAction found for AgentDelegateObservation with id={delegate_end}'
-            )
-            return
-
-        self.delegates[(delegate_start, delegate_end)] = (delegate_agent, delegate_task)
-        logger.debug(
-            f'Delegate {delegate_agent} with task {delegate_task} ran from id={delegate_start} to id={delegate_end}'
-        )
-
-    # TODO remove me when unnecessary
-    # history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
-    # we rebuild the pairs here
-    # for compatibility with the existing output format in evaluations
-    def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]:
-        history_pairs = []
-
-        for action, observation in get_pairs_from_events(
-            self.get_events_as_list(include_delegates=True)
-        ):
-            history_pairs.append((event_to_dict(action), event_to_dict(observation)))
-
-        return history_pairs
--- a/openhands/memory/memory.py
+++ b/openhands/memory/memory.py
@@ -54,10 +54,10 @@ class LongTermMemory:

        # instantiate the index
        self.index = VectorStoreIndex.from_vector_store(vector_store, self.embed_model)
-        self.thought_idx = 0

        # initialize the event stream
        self.event_stream = event_stream
+        self._events_to_docs()

        # max of threads to run the pipeline
        self.memory_max_threads = agent_config.memory_max_threads
@@ -85,18 +85,17 @@ class LongTermMemory:
            event_type = 'observation'
            event_id = event_data['observation']

-        # create a Document instance for the event
+        # create a Document instance for the event using event.id
        doc = Document(
            text=json.dumps(event_data),
-            doc_id=str(self.thought_idx),
+            doc_id=event.id,
            extra_info={
                'type': event_type,
                'id': event_id,
-                'idx': self.thought_idx,
+                'event_id': event.id,
            },
        )
-        self.thought_idx += 1
-        logger.debug('Adding %s event to memory: %d', event_type, self.thought_idx)
+        logger.debug('Adding %s event to memory with doc_id: %s', event_type, event.id)
        self._add_document(document=doc)

    def _add_document(self, document: 'Document'):
@@ -159,18 +158,17 @@ class LongTermMemory:
                    event_type = 'observation'
                    event_id = event_data['observation']

-                # create a Document instance for the event
+                # create a Document instance for the event using event.id
                doc = Document(
                    text=json.dumps(event_data),
-                    doc_id=str(self.thought_idx),
+                    doc_id=event.id,
                    extra_info={
                        'type': event_type,
                        'id': event_id,
-                        'idx': self.thought_idx,
+                        'event_id': event.id,
                    },
                )
                documents.append(doc)
-                self.thought_idx += 1
            except (json.JSONDecodeError, KeyError, ValueError) as e:
                logger.warning(f'Failed to process event: {e}')
                continue
--- a/openhands/memory/utils.py
+++ b/openhands/memory/utils.py
@@ -0,0 +1,39 @@
+import openhands.core.utils.json as json
+from openhands.core.exceptions import (
+    InvalidSummaryResponseError,
+    LLMMalformedActionError,
+    LLMResponseError,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.agent import AgentSummarizeAction
+from openhands.events.event import EventSource
+from openhands.events.serialization.event import action_from_dict
+
+
+def parse_summary_response(response: str) -> AgentSummarizeAction:
+    """
+    Parses a JSON summary of events.
+
+    Parameters:
+    - response: The response string to be parsed
+
+    Returns:
+    - The summary action output by the model
+    """
+    try:
+        action_dict = json.loads(response)
+        action = action_from_dict(action_dict)
+        if action is None or not isinstance(action, AgentSummarizeAction):
+            error_message = f'Expected a summarize action, but the response got {str(type(action)) if action else None}'
+            logger.error(error_message)
+            raise InvalidSummaryResponseError(error_message)
+        action._source = EventSource.AGENT  # type: ignore
+        action.summary = (
+            action.summarized_actions + '\n' + action.summarized_observations
+        )
+    except (LLMResponseError, LLMMalformedActionError) as e:
+        logger.error(f'Failed to parse summary response: {str(e)}')
+        raise InvalidSummaryResponseError(
+            f'Failed to parse the response: {str(e)}'
+        ) from e
+    return action
--- a/openhands/server/session/session.py
+++ b/openhands/server/session/session.py
@@ -78,6 +78,7 @@ class Session:
        self.agent_session.event_stream.add_event(
            AgentStateChangedObservation('', AgentState.LOADING), EventSource.AGENT
        )
+
        # Extract the agent-relevant arguments from the request
        args = {key: value for key, value in data.get('args', {}).items()}
        agent_cls = args.get(ConfigType.AGENT, self.config.default_agent)
@@ -102,6 +103,7 @@ class Session:

        # TODO: override other LLM config & agent config groups (#2075)

+        # set up the required arguments for the agent
        llm = LLM(config=self.config.get_llm_config_from_agent(agent_cls))
        agent_config = self.config.get_agent_config(agent_cls)
        agent = Agent.get_cls(agent_cls)(llm, agent_config)
--- a/openhands/utils/embeddings.py
+++ b/openhands/utils/embeddings.py
@@ -101,6 +101,12 @@ class EmbeddingsLoader:
                azure_endpoint=llm_config.base_url,
                api_version=llm_config.api_version,
            )
+        elif strategy == 'voyage':
+            from llama_index.legacy.embeddings.voyageai import VoyageEmbedding
+
+            return VoyageEmbedding(
+                model='voyageai/voyage-code-2',
+            )
        elif (strategy is not None) and (strategy.lower() == 'none'):
            # TODO: this works but is not elegant enough. The incentive is when
            # an agent using embeddings is not used, there is no reason we need to
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -1,6 +1,17 @@
+import importlib
 import os
+from inspect import signature
+from pathlib import Path
+from typing import Any

-from jinja2 import Template
+import yaml
+from jinja2 import (
+    Environment,
+    FileSystemLoader,
+    Template,
+    TemplateNotFound,
+    select_autoescape,
+)

 from openhands.utils.microagent import MicroAgent

@@ -17,34 +28,169 @@ class PromptManager:
        prompt_dir (str): Directory containing prompt templates.
        agent_skills_docs (str): Documentation of agent skills.
        micro_agent (MicroAgent | None): Micro-agent, if specified.
+        conversation_history (list[dict[str, Any]]): History of conversations.
+        core_memory (str): Core memory storage.
+        env (Environment): Jinja2 environment for template rendering.
+        templates (dict[str, Template]): Loaded templates.
+        available_skills (list[str]): List of available skills from YAML configuration.
    """

    def __init__(
        self,
-        prompt_dir: str,
-        agent_skills_docs: str,
+        prompt_dir: str | Path,
        micro_agent: MicroAgent | None = None,
    ):
-        self.prompt_dir: str = prompt_dir
-        self.agent_skills_docs: str = agent_skills_docs
+        """Initialize PromptManager with template directories and agent configuration.

-        self.system_template: Template = self._load_template('system_prompt')
-        self.user_template: Template = self._load_template('user_prompt')
-        self.micro_agent: MicroAgent | None = micro_agent
+        The system supports two types of templates:
+        1. Simple .md files - For basic customization with variable substitution
+        2. Advanced .j2 files - For complex templates using Jinja2 features
+
+        Templates are loaded in this order (later ones override earlier ones):
+        1. Default templates from prompt_dir
+        2. Custom templates from custom_templates_dir (set in agent.yaml)
+        3. .j2 files take precedence over .md files with the same base name
+        """
+
+        self.prompt_dir = os.path.abspath(prompt_dir)
+        self.micro_agent = micro_agent
+        self.conversation_history: list[dict[str, Any]] = []
+        self.core_memory: str = ''
+
+        # load available skills from YAML
+        yaml_path = os.path.join(prompt_dir, 'agent.yaml')
+        if os.path.exists(yaml_path):
+            with open(yaml_path, 'r') as f:
+                config = yaml.safe_load(f)
+
+            custom_templates_dir = config.get('custom_templates_dir', None)
+            if custom_templates_dir:
+                # custom templates directory is an absolute path or relative to the current working directory
+                custom_templates_dir = os.path.abspath(custom_templates_dir)
+
+                # prioritize custom_templates_dir over the default templates directory
+                self.env = Environment(
+                    loader=FileSystemLoader([custom_templates_dir, self.prompt_dir]),
+                    autoescape=select_autoescape(['j2', 'md']),
+                    trim_blocks=True,
+                    lstrip_blocks=True,
+                )
+
+            self._system_template = self._load_template(
+                config['template']['system_prompt']
+            )
+            self._agent_skills_template = self._load_template(
+                config['template']['agent_skills']
+            )
+            self._examples_template = self._load_template(
+                config['template']['examples']
+            )
+            self._user_template = self._load_template(config['template']['user_prompt'])
+
+            self.available_skills = config['agent_skills']['available_skills']
+        else:
+            # no agent.yaml file found, use the default templates
+            self.env = Environment(loader=FileSystemLoader(prompt_dir))
+
+            self._system_template = self._load_template('system_prompt')
+            self._agent_skills_template = self._load_template('agent_skills')
+            self._user_template = self._load_template('user_prompt')
+            self._examples_template = self._load_template('examples')
+
+            self.available_skills = []  # FIXME: default to empty list if YAML not found
+
+        # TODO: agent config should have a tool use enabled or disabled
+        # and we can use that to conditionally load the tools variant of agentskills
+
+        # Load all templates
+        self.templates = self._load_templates()

    def _load_template(self, template_name: str) -> Template:
-        template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
-        if not os.path.exists(template_path):
-            raise FileNotFoundError(f'Prompt file {template_path} not found')
-        with open(template_path, 'r') as file:
-            return Template(file.read())
+        """Load a template from the environment."""
+        # use the jinja2 environment to load the template
+        try:
+            return self.env.get_template(f'{template_name}.j2')
+        except TemplateNotFound:
+            # try to load from the prompt_dir
+            template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
+            if not os.path.exists(template_path):
+                raise FileNotFoundError(f'Prompt file {template_path} not found')
+            with open(template_path, 'r') as file:
+                return Template(file.read())
+
+    def _load_templates(self) -> dict[str, Template]:
+        """Load each known template, preferring the .j2 variant over the .md one.
+
+        For each template name (e.g. 'system_prompt'), checks for files in this order:
+        1. {name}.j2 in custom_templates_dir (if configured)
+        2. {name}.j2 in prompt_dir
+        3. {name}.md in custom_templates_dir (if configured)
+        4. {name}.md in prompt_dir
+
+        Returns:
+            A dictionary mapping template names to their loaded Template objects.
+        """
+        templates = {}
+
+        # Template names and their default types
+        template_configs = {
+            # Complex templates that typically need Jinja features
+            'system_prompt': '.j2',
+            'summarize_prompt': '.j2',
+            # Simple templates that work well as markdown
+            'user_prompt': '.md',
+            'examples': '.md',
+        }
+
+        for name, default_ext in template_configs.items():
+            # Try loading template with either extension
+            template = None
+            for ext in ['.j2', '.md']:
+                try:
+                    template = self.env.get_template(f'{name}{ext}')
+                    break
+                except TemplateNotFound:
+                    continue
+
+            # If no template found, create empty one
+            if template is None:
+                print(f'No template found for {name}, using empty template')
+                template = self.env.from_string('')
+
+            templates[name] = template
+
+        return templates
+
+    def get_template_variables(self) -> dict[str, Any]:
+        """Get the current template variables.
+
+        Returns:
+            Dictionary of variables available to templates.
+        """
+        return {
+            'core_memory': self.core_memory,
+            'conversation_history': self.conversation_history,
+            'micro_agent': self.micro_agent.content if self.micro_agent else None,
+        }

    @property
    def system_message(self) -> str:
-        rendered = self.system_template.render(
-            agent_skills_docs=self.agent_skills_docs,
+        """Render the system message template."""
+        # render the agent_skills.j2 template
+
+        self.env.globals['get_skill_docstring'] = self._get_skill_docstring
+        rendered_docs = self._agent_skills_template.render(
+            available_skills=self.available_skills
+        )
+        rendered = self._system_template.render(
+            agent_skills_docs=rendered_docs,
        ).strip()
        return rendered
+        # return (
+        #    self.templates['system_prompt']
+        #    .render(**self.get_template_variables())
+        #    .strip()
+        # )

    @property
    def initial_user_message(self) -> str:
@@ -57,7 +203,58 @@ class PromptManager:
        These additional context will convert the current generic agent
        into a more specialized agent that is tailored to the user's task.
        """
-        rendered = self.user_template.render(
-            micro_agent=self.micro_agent.content if self.micro_agent else None
+        # render the examples template first, then pass the result into the user_prompt template
+        rendered_examples = self._examples_template.render()
+        rendered = self._user_template.render(
+            examples=rendered_examples,
+            micro_agent=self.micro_agent.content if self.micro_agent else None,
        )
        return rendered.strip()
+
+        # return (
+        #    self.templates['user_prompt']
+        #    .render(**self.get_template_variables())
+        #    .strip()
+        # )
+
+    @property
+    def summarize_message(self) -> str:
+        """Render the summarize message template."""
+        return (
+            self.templates['summarize_prompt']
+            .render(**self.get_template_variables())
+            .strip()
+        )
+
+    def _get_skill_docstring(self, skill_name: str) -> str:
+        """Retrieves the docstring of a skill function.
+
+        Args:
+            skill_name: The name of the skill in the format 'module:function'.
+
+        Returns:
+            A formatted string containing the function signature and docstring.
+        """
+        module_name, function_name = skill_name.split(':')
+        try:
+            module = importlib.import_module(
+                f'openhands.runtime.plugins.agent_skills.{module_name}'
+            )
+
+            # find the function
+            agent_skill_fn = getattr(module, function_name)
+
+            # get the function signature with parameter names, types and return type
+            fn_signature = f'{agent_skill_fn.__name__}' + str(signature(agent_skill_fn))
+
+            doc = agent_skill_fn.__doc__
+
+            # remove indentation from docstring and extra empty lines
+            doc = '\n'.join(filter(None, map(lambda x: x.strip(), doc.split('\n'))))
+
+            # now add a consistent 4-space indentation
+            doc = '\n'.join(map(lambda x: ' ' * 4 + x, doc.split('\n')))
+            return f'{fn_signature}\n{doc}'
+        except (ImportError, AttributeError) as e:
+            print(e)
+            return f'Documentation not found for skill: {skill_name}'
--- a/poetry.lock
+++ b/poetry.lock
@@ -3924,72 +3924,88 @@ tokenizers = "*"
 extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"]
 proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"]

+[[package]]
+name = "llama-cloud"
+version = "0.1.4"
+description = ""
+optional = false
+python-versions = "<4,>=3.8"
+files = [
+    {file = "llama_cloud-0.1.4-py3-none-any.whl", hash = "sha256:cfca6c4e0a87468b922d732f0f313a2ecd3a8e0bf74382ee80829ce49dcbc5e0"},
+    {file = "llama_cloud-0.1.4.tar.gz", hash = "sha256:6f0155979bd96160951cb812c48836f1face037bc79ccfd8d185b18ef4c9faf8"},
+]
+
+[package.dependencies]
+httpx = ">=0.20.0"
+pydantic = ">=1.10"
+
 [[package]]
 name = "llama-index"
-version = "0.10.45.post1"
+version = "0.11.20"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index-0.10.45.post1-py3-none-any.whl", hash = "sha256:11ff93431a81f1eae5bb73092d194cfb66a36ea90f272ea145f20e6e4324c71c"},
-    {file = "llama_index-0.10.45.post1.tar.gz", hash = "sha256:0bd3dcdbbfa468c408ad2f9e839b60562367ec6563c13c9bddcd108309881447"},
+    {file = "llama_index-0.11.20-py3-none-any.whl", hash = "sha256:fc9e5e47e6da3610bc3b788d208bb782c03a342fd71e3b22b37abc83ecebe46e"},
+    {file = "llama_index-0.11.20.tar.gz", hash = "sha256:5e8e3fcb5af5b4e4525498b075ff0a54160b00bf0fc0b83801fc7faf1c8a8c1d"},
 ]

 [package.dependencies]
-llama-index-agent-openai = ">=0.1.4,<0.3.0"
-llama-index-cli = ">=0.1.2,<0.2.0"
-llama-index-core = "0.10.45"
-llama-index-embeddings-openai = ">=0.1.5,<0.2.0"
-llama-index-indices-managed-llama-cloud = ">=0.1.2,<0.2.0"
+llama-index-agent-openai = ">=0.3.4,<0.4.0"
+llama-index-cli = ">=0.3.1,<0.4.0"
+llama-index-core = ">=0.11.20,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.4,<0.3.0"
+llama-index-indices-managed-llama-cloud = ">=0.3.0"
 llama-index-legacy = ">=0.9.48,<0.10.0"
-llama-index-llms-openai = ">=0.1.13,<0.2.0"
-llama-index-multi-modal-llms-openai = ">=0.1.3,<0.2.0"
-llama-index-program-openai = ">=0.1.3,<0.2.0"
-llama-index-question-gen-openai = ">=0.1.2,<0.2.0"
-llama-index-readers-file = ">=0.1.4,<0.2.0"
-llama-index-readers-llama-parse = ">=0.1.2,<0.2.0"
+llama-index-llms-openai = ">=0.2.10,<0.3.0"
+llama-index-multi-modal-llms-openai = ">=0.2.0,<0.3.0"
+llama-index-program-openai = ">=0.2.0,<0.3.0"
+llama-index-question-gen-openai = ">=0.2.0,<0.3.0"
+llama-index-readers-file = ">=0.2.0,<0.3.0"
+llama-index-readers-llama-parse = ">=0.3.0"
+nltk = ">3.8.1"

 [[package]]
 name = "llama-index-agent-openai"
-version = "0.2.9"
+version = "0.3.4"
 description = "llama-index agent openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_agent_openai-0.2.9-py3-none-any.whl", hash = "sha256:d7f0fd4c87124781acd783be603871f8808b1a3969e876a9c96e2ed0844d46ac"},
-    {file = "llama_index_agent_openai-0.2.9.tar.gz", hash = "sha256:debe86da6d9d983db32b445ddca7c798ac140fe59573bafded73595b3995f3d5"},
+    {file = "llama_index_agent_openai-0.3.4-py3-none-any.whl", hash = "sha256:3720ce9bb12417a99a3fe84e52cce23e762b13f88a2dfc4292c76f4df9b26b4a"},
+    {file = "llama_index_agent_openai-0.3.4.tar.gz", hash = "sha256:80e3408d97121bebca3fa3ffd14b51285870c1c3c73d4ee04d3d18cfe6040466"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.41,<0.11.0"
-llama-index-llms-openai = ">=0.1.5,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.9,<0.3.0"
 openai = ">=1.14.0"

 [[package]]
 name = "llama-index-cli"
-version = "0.1.13"
+version = "0.3.1"
 description = "llama-index cli"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_cli-0.1.13-py3-none-any.whl", hash = "sha256:5e05bc3ce55ee1bf6e5af7e87631a71d6b6cf8fc2af10cd3947b09b1bac6788d"},
-    {file = "llama_index_cli-0.1.13.tar.gz", hash = "sha256:86147ded4439fbab1d6c7c0d72e8f231d2935da9fdf5c9d3f0dde4f35d44aa59"},
+    {file = "llama_index_cli-0.3.1-py3-none-any.whl", hash = "sha256:2111fbb6973f5b1eabce0d6cca3986499f0f2f625b13d7f48269a49c64c027d4"},
+    {file = "llama_index_cli-0.3.1.tar.gz", hash = "sha256:1890dd687cf440f3651365a549e303363162c167b8efbd87a3aa10058d6d5c77"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-embeddings-openai = ">=0.1.1,<0.2.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.0,<0.3.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"

 [[package]]
 name = "llama-index-core"
-version = "0.10.45"
+version = "0.11.20"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_core-0.10.45-py3-none-any.whl", hash = "sha256:8c800c7221322b8e1cbbbc13325039b5fe3575d4b0e0be14ac9a8f1e5d14fee3"},
-    {file = "llama_index_core-0.10.45.tar.gz", hash = "sha256:f32d0448e7193ff45c8e84abd49493be030998fc8f1a0cab069387deef3e577c"},
+    {file = "llama_index_core-0.11.20-py3-none-any.whl", hash = "sha256:e84daf45e90e4b5d9e135baf40ab9853a1c3169a1076af6d58739d098e70adb1"},
+    {file = "llama_index_core-0.11.20.tar.gz", hash = "sha256:6b5eaaf4be5030808b9ba953e8f7aead7ba495b8e72ba0a81dfc7dda96be416f"},
 ]

 [package.dependencies]
@@ -3999,18 +4015,16 @@ deprecated = ">=1.2.9.3"
 dirtyjson = ">=1.0.8,<2.0.0"
 fsspec = ">=2023.5.0"
 httpx = "*"
-llamaindex-py-client = ">=0.1.18,<0.2.0"
 nest-asyncio = ">=1.5.8,<2.0.0"
 networkx = ">=3.0"
-nltk = ">=3.8.1,<4.0.0"
-numpy = "*"
-openai = ">=1.1.0"
-pandas = "*"
+nltk = ">3.8.1"
+numpy = "<2.0.0"
 pillow = ">=9.0.0"
+pydantic = ">=2.7.0,<3.0.0"
 PyYAML = ">=6.0.1"
 requests = ">=2.31.0"
 SQLAlchemy = {version = ">=1.4.49", extras = ["asyncio"]}
-tenacity = ">=8.2.0,<9.0.0"
+tenacity = ">=8.2.0,<8.4.0 || >8.4.0,<9.0.0"
 tiktoken = ">=0.3.3"
 tqdm = ">=4.66.1,<5.0.0"
 typing-extensions = ">=4.5.0"
@@ -4019,79 +4033,95 @@ wrapt = "*"

 [[package]]
 name = "llama-index-embeddings-azure-openai"
-version = "0.1.11"
+version = "0.2.5"
 description = "llama-index embeddings azure openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_azure_openai-0.1.11-py3-none-any.whl", hash = "sha256:afefe55ee69934528c569ddf71fb1e9ddf2992b6c344c4c9d72a03fa8c33cf40"},
-    {file = "llama_index_embeddings_azure_openai-0.1.11.tar.gz", hash = "sha256:40a4fd9a31ba74f071739d6c8405187b66e7f584ae2f64a30316c6c7b6a25325"},
+    {file = "llama_index_embeddings_azure_openai-0.2.5-py3-none-any.whl", hash = "sha256:e3384002618d027c3d188134e7fe09ffb16029202db6b3e6955a9f1f6d591a3e"},
+    {file = "llama_index_embeddings_azure_openai-0.2.5.tar.gz", hash = "sha256:d8b2e3134c2b3510214f2260e6c17be18396d0c765f3edd6c3ffe6109528aed0"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-embeddings-openai = ">=0.1.3,<0.2.0"
-llama-index-llms-azure-openai = ">=0.1.3,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.3,<0.3.0"
+llama-index-llms-azure-openai = ">=0.2.0,<0.3.0"

 [[package]]
 name = "llama-index-embeddings-huggingface"
-version = "0.2.3"
+version = "0.3.1"
 description = "llama-index embeddings huggingface integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_huggingface-0.2.3-py3-none-any.whl", hash = "sha256:7dee842f938d5fa8992e7803eda8a14f6bea72ec0bc0a546f4c6aa455166cde5"},
-    {file = "llama_index_embeddings_huggingface-0.2.3.tar.gz", hash = "sha256:6fe54366eeb87ff81b50624d6b8ccca4230f8035fcc19a0b0b3f31c6d8a82f8b"},
+    {file = "llama_index_embeddings_huggingface-0.3.1-py3-none-any.whl", hash = "sha256:71708240b1aec183c80f20d531b39a75d0cce774586e11bb0798f3ecb270749c"},
+    {file = "llama_index_embeddings_huggingface-0.3.1.tar.gz", hash = "sha256:7aef6324a19576e6b95bfe927c3bd4fc1c5725edce9f26b4e5d2eefa27c02fdb"},
 ]

 [package.dependencies]
 huggingface-hub = {version = ">=0.19.0", extras = ["inference"]}
-llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
 sentence-transformers = ">=2.6.1"

 [[package]]
 name = "llama-index-embeddings-ollama"
-version = "0.2.0"
+version = "0.3.1"
 description = "llama-index embeddings ollama integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_ollama-0.2.0-py3-none-any.whl", hash = "sha256:372b059321386bd9bbf4f619ad33dd551adb9ee92eeeb0c664d3466f7c212e2e"},
-    {file = "llama_index_embeddings_ollama-0.2.0.tar.gz", hash = "sha256:5673c740e1dd146e17d1c0401c1e179c0d559caf0967f4a4721b89fbb6822ad8"},
+    {file = "llama_index_embeddings_ollama-0.3.1-py3-none-any.whl", hash = "sha256:b869ce7e9f8e67aa7d81336e90d25d3ea1fca91c68dce8922b2d4b9c06c5acef"},
+    {file = "llama_index_embeddings_ollama-0.3.1.tar.gz", hash = "sha256:5a3e75fa14be7e2b1a82937416c880204dc96e1b1d2626dc5bde93f021e7b540"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
 ollama = ">=0.3.1,<0.4.0"

 [[package]]
 name = "llama-index-embeddings-openai"
-version = "0.1.11"
+version = "0.2.5"
 description = "llama-index embeddings openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_openai-0.1.11-py3-none-any.whl", hash = "sha256:e20806fc4baff6b8f5274decf2c1ca7c5c737648e01865475ffada164e32e173"},
-    {file = "llama_index_embeddings_openai-0.1.11.tar.gz", hash = "sha256:6025e229e375201788a9b14d6ebe470329907576cba5f6b7b832c3d68f39db30"},
+    {file = "llama_index_embeddings_openai-0.2.5-py3-none-any.whl", hash = "sha256:823c8311e556349ba19dda408a64a314fa3dafe0e5759709c54d33a0269aa6ba"},
+    {file = "llama_index_embeddings_openai-0.2.5.tar.gz", hash = "sha256:0047dd71d747068645ed728c29312aa91b65bbe4c6142180034c64dfc5c6f6e8"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+openai = ">=1.1.0"
+
+[[package]]
+name = "llama-index-embeddings-voyageai"
+version = "0.2.2"
+description = "llama-index embeddings voyageai integration"
+optional = false
+python-versions = "<4.0,>=3.8.1"
+files = [
+    {file = "llama_index_embeddings_voyageai-0.2.2-py3-none-any.whl", hash = "sha256:7bbb79558d474497ff700a930a0f9081976d1b4e0f5107e38a1059600de92c58"},
+    {file = "llama_index_embeddings_voyageai-0.2.2.tar.gz", hash = "sha256:237f70074af05f3b950c89d5d0720de30f9f5e98426a420f6e08125600b69be9"},
+]
+
+[package.dependencies]
+llama-index-core = ">=0.11.0,<0.12.0"
+voyageai = ">=0.2.1,<0.3.0"

 [[package]]
 name = "llama-index-indices-managed-llama-cloud"
-version = "0.1.6"
+version = "0.4.0"
 description = "llama-index indices llama-cloud integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_indices_managed_llama_cloud-0.1.6-py3-none-any.whl", hash = "sha256:cba33e1a3677b2a2ae7f239119acbf6dc3818f105edc92315729842b56fbc949"},
-    {file = "llama_index_indices_managed_llama_cloud-0.1.6.tar.gz", hash = "sha256:74b3b0e9ebf9d348d3054f9fc0c657031acceb9351c31116ad8d5a7ae4729f5c"},
+    {file = "llama_index_indices_managed_llama_cloud-0.4.0-py3-none-any.whl", hash = "sha256:c2c54821f1bf17a7810e6c013fbe7ddfef4154b7e5b100f7bf8673098f8004e4"},
+    {file = "llama_index_indices_managed_llama_cloud-0.4.0.tar.gz", hash = "sha256:fbebff7876a219b6ab96892ae7c432a9299195fab8f67d4a4a0ebf6da210b242"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.0,<0.11.0"
-llamaindex-py-client = ">=0.1.19,<0.2.0"
+llama-cloud = ">=0.0.11"
+llama-index-core = ">=0.11.13.post1,<0.12.0"

 [[package]]
 name = "llama-index-legacy"
@@ -4134,96 +4164,98 @@ query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "l

 [[package]]
 name = "llama-index-llms-azure-openai"
-version = "0.1.10"
+version = "0.2.2"
 description = "llama-index llms azure openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_llms_azure_openai-0.1.10-py3-none-any.whl", hash = "sha256:8666b095118ed9c5087dc2d91a83a826d4549ea4d442b9eef363e243207d3539"},
-    {file = "llama_index_llms_azure_openai-0.1.10.tar.gz", hash = "sha256:f1624c9bd7bf4458e98cca6f3b805eec06105fa951536ff24b098d913d2368bd"},
+    {file = "llama_index_llms_azure_openai-0.2.2-py3-none-any.whl", hash = "sha256:c8a7d04a111ceff0b4335dc9273fbdb37fdb5095b6234190ca727736f6466d7b"},
+    {file = "llama_index_llms_azure_openai-0.2.2.tar.gz", hash = "sha256:717bc3bf858e800d66e4f2ddec85a2e7dd503006d55981053d08e98771ec3abc"},
 ]

 [package.dependencies]
 azure-identity = ">=1.15.0,<2.0.0"
 httpx = "*"
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.1,<0.3.0"

 [[package]]
 name = "llama-index-llms-openai"
-version = "0.1.26"
+version = "0.2.16"
 description = "llama-index llms openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_llms_openai-0.1.26-py3-none-any.whl", hash = "sha256:1ad8e4eb02f9410c2091749d4d9aa9db4452646b595eb5eb937edbc496fb65fe"},
-    {file = "llama_index_llms_openai-0.1.26.tar.gz", hash = "sha256:08a408cd53af4cd4623dd5807be4cbbd5e5b3ca01272128cd678d667343e4d5d"},
+    {file = "llama_index_llms_openai-0.2.16-py3-none-any.whl", hash = "sha256:413466acbb894bd81f8dab2037f595e92392d869eec6d8274a16d43123cac8b6"},
+    {file = "llama_index_llms_openai-0.2.16.tar.gz", hash = "sha256:7c666dd27056c278a079ff45d53f1fbfc8ed363764aa7baeee2e03df47f9072a"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.24,<0.11.0"
+llama-index-core = ">=0.11.7,<0.12.0"
+openai = ">=1.40.0,<2.0.0"

 [[package]]
 name = "llama-index-multi-modal-llms-openai"
-version = "0.1.9"
+version = "0.2.3"
 description = "llama-index multi-modal-llms openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_multi_modal_llms_openai-0.1.9-py3-none-any.whl", hash = "sha256:614f40427a4671e72742780be8fda77297dbf2942519bffcb2c9de8696a9edff"},
-    {file = "llama_index_multi_modal_llms_openai-0.1.9.tar.gz", hash = "sha256:dbacf44d5c2cca07ca424eacd1337583002d70387a3c1868cf8ae743b1dbec4a"},
+    {file = "llama_index_multi_modal_llms_openai-0.2.3-py3-none-any.whl", hash = "sha256:96b36beb2c3fca4faca80c59ecf7c6c6629ecdb96c288ef89777b592ec43f872"},
+    {file = "llama_index_multi_modal_llms_openai-0.2.3.tar.gz", hash = "sha256:8eb9b7f1ff3956ef0979e21bc83e6a885e40987b7199f195e46525d06e3ae402"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.11,<0.3.0"

 [[package]]
 name = "llama-index-program-openai"
-version = "0.1.6"
+version = "0.2.0"
 description = "llama-index program openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_program_openai-0.1.6-py3-none-any.whl", hash = "sha256:4660b338503537c5edca1e0dab606af6ce372b4f1b597e2833c6b602447c5d8d"},
-    {file = "llama_index_program_openai-0.1.6.tar.gz", hash = "sha256:c6a4980c5ea826088b28b4dee3367edb20221e6d05eb0e05019049190131d772"},
+    {file = "llama_index_program_openai-0.2.0-py3-none-any.whl", hash = "sha256:2e10d0c8f21af2e9443eb79e81bb31e7b73835b7c7bbd7ddf20e0a9c846cd368"},
+    {file = "llama_index_program_openai-0.2.0.tar.gz", hash = "sha256:4139935541c011257fbfeb9662b3bf1237b729ef4b1c8f4ddf5b6789d2374ac4"},
 ]

 [package.dependencies]
-llama-index-agent-openai = ">=0.1.1,<0.3.0"
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-agent-openai = ">=0.3.0,<0.4.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"

 [[package]]
 name = "llama-index-question-gen-openai"
-version = "0.1.3"
+version = "0.2.0"
 description = "llama-index question_gen openai integration"
 optional = false
-python-versions = ">=3.8.1,<4.0"
+python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_question_gen_openai-0.1.3-py3-none-any.whl", hash = "sha256:1f83b49e8b2e665030d1ec8c54687d6985d9fa8426147b64e46628a9e489b302"},
-    {file = "llama_index_question_gen_openai-0.1.3.tar.gz", hash = "sha256:4486198117a45457d2e036ae60b93af58052893cc7d78fa9b6f47dd47b81e2e1"},
+    {file = "llama_index_question_gen_openai-0.2.0-py3-none-any.whl", hash = "sha256:a16e68fc5434e9a793f1dfd0cc0354ee19afd167f1d499403b0085b11c5406c0"},
+    {file = "llama_index_question_gen_openai-0.2.0.tar.gz", hash = "sha256:3dde1cecbd651000639c20031d7ea23334276aabb181cac40ff424f35e10465e"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
-llama-index-program-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"
+llama-index-program-openai = ">=0.2.0,<0.3.0"

 [[package]]
 name = "llama-index-readers-file"
-version = "0.1.33"
+version = "0.2.2"
 description = "llama-index readers file integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_readers_file-0.1.33-py3-none-any.whl", hash = "sha256:c968308497c1355acf61fe7e3f05ad8e308bb6487dddd3bd2a60e102225d0b38"},
-    {file = "llama_index_readers_file-0.1.33.tar.gz", hash = "sha256:247a4d5bfabc7d1022027adf58064bc16c224d006db142abb0d182ac5574a887"},
+    {file = "llama_index_readers_file-0.2.2-py3-none-any.whl", hash = "sha256:ffec878771c1e7575afb742887561059bcca77b97a81c1c1be310ebb73f10f46"},
+    {file = "llama_index_readers_file-0.2.2.tar.gz", hash = "sha256:48459f90960b863737147b66ed83afec9ce8984f8eda2561b6d2500214365db2"},
 ]

 [package.dependencies]
 beautifulsoup4 = ">=4.12.3,<5.0.0"
-llama-index-core = ">=0.10.37.post1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+pandas = "*"
 pypdf = ">=4.0.1,<5.0.0"
 striprtf = ">=0.0.26,<0.0.27"

@@ -4232,62 +4264,48 @@ pymupdf = ["pymupdf (>=1.23.21,<2.0.0)"]

 [[package]]
 name = "llama-index-readers-llama-parse"
-version = "0.1.6"
+version = "0.3.0"
 description = "llama-index readers llama-parse integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_readers_llama_parse-0.1.6-py3-none-any.whl", hash = "sha256:71d445a2357ce4c632e0fada7c913ac62790e77c062f12d916dd86378380ff1f"},
-    {file = "llama_index_readers_llama_parse-0.1.6.tar.gz", hash = "sha256:04f2dcfbb0fb87ce70890f5a2f4f89941d79be6a818b43738f053560e4b451cf"},
+    {file = "llama_index_readers_llama_parse-0.3.0-py3-none-any.whl", hash = "sha256:1973cc710dbd5e110c7500c9983ecb45787ad1ff92e6b2113f94a57cf48f3038"},
+    {file = "llama_index_readers_llama_parse-0.3.0.tar.gz", hash = "sha256:a5feada0895714dcc41d65dd512c1c38cf70d8ae19947cff82b80d58e6aa367e"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.7,<0.11.0"
-llama-parse = ">=0.4.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-parse = ">=0.5.0"

 [[package]]
 name = "llama-index-vector-stores-chroma"
-version = "0.1.10"
+version = "0.2.1"
 description = "llama-index vector_stores chroma integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_vector_stores_chroma-0.1.10-py3-none-any.whl", hash = "sha256:18859272ec8d3ed20bae7e4a9bc18feb4233e8be2a725d33626f283ac41d1475"},
-    {file = "llama_index_vector_stores_chroma-0.1.10.tar.gz", hash = "sha256:97971f7b36461ef37be023b9ceb5531396cc48360d0bdbda51cce1290301cc47"},
+    {file = "llama_index_vector_stores_chroma-0.2.1-py3-none-any.whl", hash = "sha256:6dcca6450d298d3033a47b2131d0618ad48c172a3541eb6c790a61bf94136fed"},
+    {file = "llama_index_vector_stores_chroma-0.2.1.tar.gz", hash = "sha256:def15a76354bb4658b16badb92537a72e766273d5e566b0575461005da53847f"},
 ]

 [package.dependencies]
-chromadb = ">=0.4.0,<0.6.0"
-llama-index-core = ">=0.10.1,<0.11.0"
+chromadb = ">=0.4.0,<0.5.4 || >0.5.4,<0.5.7 || >0.5.7,<0.5.9 || >0.5.9,<0.5.10 || >0.5.10,<0.5.11 || >0.5.11,<0.5.12 || >0.5.12,<0.6.0"
+llama-index-core = ">=0.11.0,<0.12.0"

 [[package]]
 name = "llama-parse"
-version = "0.4.9"
+version = "0.5.12"
 description = "Parse files into RAG-Optimized formats."
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_parse-0.4.9-py3-none-any.whl", hash = "sha256:71974a57a73d642608cc406942bee4e7fc1a713fa410f51df67da509479ba544"},
-    {file = "llama_parse-0.4.9.tar.gz", hash = "sha256:657f8fa5f7d399f14c0454fc05cae6034da0373f191df6cfca17a1b4a704ef87"},
+    {file = "llama_parse-0.5.12-py3-none-any.whl", hash = "sha256:6011feb49da5db4bcbeea1cc6688b6ff24b483877fda80b03fe59239cd08b907"},
+    {file = "llama_parse-0.5.12.tar.gz", hash = "sha256:e241606cf3574425df76c0f5d01a31a95c792c6fbef80aaf72f8ed6448bd1715"},
 ]

 [package.dependencies]
-llama-index-core = ">=0.10.29"
-
-[[package]]
-name = "llamaindex-py-client"
-version = "0.1.19"
-description = ""
-optional = false
-python-versions = "<4,>=3.8"
-files = [
-    {file = "llamaindex_py_client-0.1.19-py3-none-any.whl", hash = "sha256:fd9416fd78b97209bf323bc3c7fab314499778563e7274f10853ad560563d10e"},
-    {file = "llamaindex_py_client-0.1.19.tar.gz", hash = "sha256:73f74792bb8c092bae6dc626627a09ac13a099fa8d10f8fcc83e17a2b332cca7"},
-]
-
-[package.dependencies]
-httpx = ">=0.20.0"
-pydantic = ">=1.10"
+click = ">=8.1.7,<9.0.0"
+llama-index-core = ">=0.11.0"

 [[package]]
 name = "lxml"
@@ -5298,56 +5316,47 @@ test = ["pytest", "pytest-console-scripts", "pytest-jupyter", "pytest-tornasync"

 [[package]]
 name = "numpy"
-version = "2.0.2"
+version = "1.26.4"
 description = "Fundamental package for array computing in Python"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"},
-    {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"},
-    {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"},
-    {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"},
-    {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"},
-    {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"},
-    {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"},
-    {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"},
-    {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"},
-    {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"},
-    {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"},
-    {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"},
-    {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"},
-    {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"},
-    {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"},
-    {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"},
-    {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"},
-    {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"},
-    {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"},
-    {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"},
-    {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"},
-    {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"},
-    {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"},
-    {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"},
-    {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"},
-    {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"},
+    {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"},
+    {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"},
+    {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"},
+    {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"},
+    {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"},
+    {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"},
+    {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"},
+    {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"},
+    {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
 ]

 [[package]]
@@ -9354,6 +9363,24 @@ platformdirs = ">=3.9.1,<5"
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
 test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]

+[[package]]
+name = "voyageai"
+version = "0.2.4"
+description = ""
+optional = false
+python-versions = "<4.0.0,>=3.7.1"
+files = [
+    {file = "voyageai-0.2.4-py3-none-any.whl", hash = "sha256:e3070e5c78dec89adae43231334b4637aa88933dad99b1c33d3219fdfc94dfa4"},
+    {file = "voyageai-0.2.4.tar.gz", hash = "sha256:b9911d8629e8a4e363291c133482fead49a3536afdf1e735f3ab3aaccd8d250d"},
+]
+
+[package.dependencies]
+aiohttp = ">=3.5,<4.0"
+aiolimiter = ">=1.1.0,<2.0.0"
+numpy = ">=1.11"
+requests = ">=2.20,<3.0"
+tenacity = ">=8.0.1"
+
 [[package]]
 name = "watchdog"
 version = "5.0.3"
@@ -10095,4 +10122,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "aeb09e429a789c3f8ced605e7e1a5932fd6cce7f7f4ce30a960da77fba18b9a3"
+content-hash = "62de6b5fb79f97f563a3ff6a4cf225cc639954745b63ddea8921b2eb9fb0e155"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,6 +67,7 @@ llama-index-embeddings-huggingface = "*"
 torch = "2.5.0"
 llama-index-embeddings-azure-openai = "*"
 llama-index-embeddings-ollama = "*"
+llama-index-embeddings-voyageai = "*"

 [tool.poetry.group.dev.dependencies]
 ruff = "0.7.1"
@@ -89,6 +90,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]

+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -119,6 +121,7 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"

+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
--- a/tests/unit/test_codeact_agent.py
+++ b/tests/unit/test_codeact_agent.py
@@ -92,5 +92,4 @@ def test_error_observation_message(agent: CodeActAgent):
 def test_unknown_observation_message(agent: CodeActAgent):
    obs = Mock()

-    with pytest.raises(ValueError, match='Unknown observation type:'):
-        agent.get_observation_message(obs)
+    assert agent.get_observation_message(obs) is None
--- a/tests/unit/test_condenser.py
+++ b/tests/unit/test_condenser.py
@@ -1,44 +1,172 @@
-from unittest.mock import Mock, patch
+import argparse
+import json
+import os
+from datetime import datetime
+from pathlib import Path

-import pytest
-
-from openhands.core.exceptions import LLMResponseError
+from openhands.core import logger
+from openhands.core.config.utils import get_llm_config_arg, load_app_config
+from openhands.core.message import Message, TextContent
+from openhands.events.action.agent import AgentSummarizeAction
 from openhands.llm.llm import LLM
 from openhands.memory.condenser import MemoryCondenser
+from openhands.utils.prompt import PromptManager


-@pytest.fixture
-def memory_condenser():
-    return MemoryCondenser()
+def save_messages_for_debugging(
+    messages: list[Message], summary_action: AgentSummarizeAction
+) -> None:
+    """
+    Saves the messages, followed by the summary action, to ./logs/debug_summary_<timestamp>.json.
+    NOTE(review): confirm the imported `logger` is a Logger object and not the logger module.
+
+    Args:
+        messages (list[Message]): The list of messages to serialize.
+        summary_action (AgentSummarizeAction): The summary action, stored via str() as the last entry.
+    """
+    # Ensure the logs directory exists (outside the try block, so mkdir errors propagate)
+    log_dir = Path('./logs')
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    # Generate a timestamped filename (local time, one-second resolution)
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    filename = f'debug_summary_{timestamp}.json'
+    file_path = log_dir / filename
+
+    try:
+        # Serialize messages using Pydantic's model_dump()
+        serialized_messages = [message.model_dump() for message in messages]
+
+        # Wrap the summary_action's string form in an assistant Message
+        summary_event = Message(
+            role='assistant', content=[TextContent(text=str(summary_action))]
+        )
+        serialized_summary = summary_event.model_dump()
+
+        # The summary goes last, after all the original messages
+        serialized_messages.append(serialized_summary)
+
+        with file_path.open('w', encoding='utf-8') as f:
+            json.dump(serialized_messages, f, ensure_ascii=False, indent=4)
+
+        logger.debug(f'Messages successfully saved to {file_path}')
+    except Exception as e:  # best-effort: serialization/write failures are logged, not raised
+        logger.error(f'Failed to save messages for debugging: {e}')


-@pytest.fixture
-def mock_llm():
-    return Mock(spec=LLM)
+def main(condenser: MemoryCondenser, file_path: str | None = None):
+    """
+    Main method for quick testing and debugging.
+    Loads messages from a JSON log file, condenses them, prints the summary action and saves it as <log stem>.summary.json.
+    If no file is specified, the instance_*_*.json file in ./logs/deepseek-24sept with the largest trailing number is used.
+
+    Args:
+        condenser (MemoryCondenser): The condenser whose condense() is run on the loaded messages.
+        file_path (str | None): The path to the log file to process. If None, the latest file is used.
+    """
+    log_dir = Path('./logs/deepseek-24sept')
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    if file_path:
+        target_log = Path(file_path)
+        if not target_log.exists():
+            print(f'Specified log file does not exist: {target_log}')
+            return
+    else:
+        log_files = list(log_dir.glob('instance_*_*.json'))
+
+        if not log_files:
+            print(
+                'No instance_*_*.json files found in the ./logs/deepseek-24sept directory.'
+            )
+            return
+
+        # Sort files to find the latest one based on the digits at the end of the filename
+        def extract_digits(file_path: Path) -> int:
+            try:
+                # The sort key is the integer after the last underscore of the stem
+                digits_str = file_path.stem.split('_')[-1]
+                return int(digits_str)
+            except (IndexError, ValueError):
+                # Non-numeric endings get -1 so they sort last in the descending order
+                return -1
+
+        log_files.sort(key=extract_digits, reverse=True)
+        target_log = log_files[0]
+
+        print(f'Loading messages from: {target_log}')
+
+    try:
+        with target_log.open('r', encoding='utf-8') as f:
+            messages_data = json.load(f)
+
+            # convert string content to list of TextContent if necessary
+            for msg in messages_data:
+                if isinstance(msg['content'], str):
+                    msg['content'] = [{'type': 'text', 'text': msg['content']}]
+
+            messages: list[Message] = [
+                Message.model_validate(msg, strict=False) for msg in messages_data
+            ]
+
+            print(f'Successfully loaded {len(messages)} messages:')
+            # for msg in messages:
+            #    print(f'{msg.role}:\n {msg.content[50:]}')
+
+            # run condense on these messages
+            summary_action = condenser.condense(messages)
+            print(f'summary_action: {summary_action}')
+
+            # save the summary action next to the log file, with .json replaced by .summary.json
+            summary_file_path = target_log.with_suffix('.summary.json')
+            with summary_file_path.open('w', encoding='utf-8') as f:
+                json.dump(summary_action.model_dump(), f, ensure_ascii=False, indent=4)
+
+    except Exception as e:  # also catches failures from condense() and from writing the summary
+        print(f'An error occurred while reading {target_log}: {e}')
+        return


-def test_condense_success(memory_condenser, mock_llm):
-    mock_llm.completion.return_value = {
-        'choices': [{'message': {'content': 'Condensed memory'}}]
-    }
-    result = memory_condenser.condense('Summarize this', mock_llm)
-    assert result == 'Condensed memory'
-    mock_llm.completion.assert_called_once_with(
-        messages=[{'content': 'Summarize this', 'role': 'user'}]
+if __name__ == '__main__':
+    # load or simulate dependencies as needed for testing
+    app_config = load_app_config()
+    llm_config = get_llm_config_arg('deepseek')
+    if llm_config is not None:
+        llm = LLM(config=llm_config)
+    else:
+        llm = LLM(app_config.get_llm_config('llm'))
+
+    prompt_dir = os.path.join(
+        os.path.dirname(__file__),
+        '..',
+        '..',
+        'openhands',
+        'agenthub',
+        'memcodeact_agent',
+        'prompts',
+    )
+    prompt_manager = PromptManager(
+        prompt_dir=prompt_dir,
+        agent_skills_docs='',
    )

+    condenser = MemoryCondenser(llm=llm, prompt_manager=prompt_manager)

-def test_condense_exception(memory_condenser, mock_llm):
-    mock_llm.completion.side_effect = LLMResponseError('LLM error')
-    with pytest.raises(LLMResponseError, match='LLM error'):
-        memory_condenser.condense('Summarize this', mock_llm)
+    # attach the save_messages_for_debugging function to the condenser on the fly
+    condenser.save_messages_for_debugging = save_messages_for_debugging

-
-@patch('openhands.memory.condenser.logger')
-def test_condense_logs_error(mock_logger, memory_condenser, mock_llm):
-    mock_llm.completion.side_effect = LLMResponseError('LLM error')
-    with pytest.raises(LLMResponseError):
-        memory_condenser.condense('Summarize this', mock_llm)
-    mock_logger.error.assert_called_once_with(
-        'Error condensing thoughts: %s', 'LLM error', exc_info=False
+    # Setup argument parser for optional file parameter
+    parser = argparse.ArgumentParser(description='Run MemoryCondenser on a .json file.')
+    parser.add_argument(
+        '--file',
+        type=str,
+        default=None,
+        help='Path to the specific file to process. If not provided, the latest file is used.',
    )
+    args = parser.parse_args()
+
+    if args.file is not None and args.file == '':
+        args.file = None
+
+    # Call the main method with the specified file path if provided
+    main(condenser, file_path=args.file)
--- a/tests/unit/test_is_stuck.py
+++ b/tests/unit/test_is_stuck.py
@@ -17,8 +17,6 @@ from openhands.events.observation.commands import IPythonRunCellObservation
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.stream import EventSource, EventStream
-from openhands.events.utils import get_pairs_from_events
-from openhands.memory.history import ShortTermHistory
 from openhands.storage import get_file_store


@@ -55,22 +53,21 @@ def event_stream(temp_dir):

 class TestStuckDetector:
    @pytest.fixture
-    def stuck_detector(self, event_stream):
+    def stuck_detector(self):
        state = State(inputs={}, max_iterations=50)
-        state.history.set_event_stream(event_stream)
-
+        state.history = []  # Initialize history as an empty list
        return StuckDetector(state)

    def _impl_syntax_error_events(
        self,
-        event_stream: EventStream,
+        state: State,
        error_message: str,
        random_line: bool,
        incidents: int = 4,
    ):
        for i in range(incidents):
            ipython_action = IPythonRunCellAction(code=code_snippet)
-            event_stream.add_event(ipython_action, EventSource.AGENT)
+            state.history.append(ipython_action)
            extra_number = (i + 1) * 10 if random_line else '42'
            extra_line = '\n' * (i + 1) if random_line else ''
            ipython_observation = IPythonRunCellObservation(
@@ -79,15 +76,15 @@ class TestStuckDetector:
                f'{error_message}{extra_line}' + jupyter_line_1 + jupyter_line_2,
                code=code_snippet,
            )
-            ipython_observation._cause = ipython_action._id
-            event_stream.add_event(ipython_observation, EventSource.USER)
+            # ipython_observation._cause = ipython_action._id
+            state.history.append(ipython_observation)

    def _impl_unterminated_string_error_events(
-        self, event_stream: EventStream, random_line: bool, incidents: int = 4
+        self, state: State, random_line: bool, incidents: int = 4
    ):
        for i in range(incidents):
            ipython_action = IPythonRunCellAction(code=code_snippet)
-            event_stream.add_event(ipython_action, EventSource.AGENT)
+            state.history.append(ipython_action)
            line_number = (i + 1) * 10 if random_line else '1'
            ipython_observation = IPythonRunCellObservation(
                content=f'print("  Cell In[1], line {line_number}\nhello\n       ^\nSyntaxError: unterminated string literal (detected at line {line_number})'
@@ -95,34 +92,30 @@ class TestStuckDetector:
                + jupyter_line_2,
                code=code_snippet,
            )
-            ipython_observation._cause = ipython_action._id
-            event_stream.add_event(ipython_observation, EventSource.USER)
+            # ipython_observation._cause = ipython_action._id
+            state.history.append(ipython_observation)

-    def test_history_too_short(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_history_too_short(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
        message_action = MessageAction(content='Hello', wait_for_response=False)
        message_action._source = EventSource.USER
        observation = NullObservation(content='')
-        observation._cause = message_action.id
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(observation, EventSource.USER)
+        # observation._cause = message_action.id
+        state.history.append(message_action)
+        state.history.append(observation)

        cmd_action = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action, EventSource.AGENT)
+        state.history.append(cmd_action)
        cmd_observation = CmdOutputObservation(
            command_id=1, command='ls', content='file1.txt\nfile2.txt'
        )
-        cmd_observation._cause = cmd_action._id
-        event_stream.add_event(cmd_observation, EventSource.USER)
-
-        # stuck_detector.state.history.set_event_stream(event_stream)
+        # cmd_observation._cause = cmd_action._id
+        state.history.append(cmd_observation)

        assert stuck_detector.is_stuck() is False

-    def test_is_stuck_repeating_action_observation(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
        message_action = MessageAction(content='Done', wait_for_response=False)
        message_action._source = EventSource.USER

@@ -130,135 +123,125 @@ class TestStuckDetector:
        hello_observation = NullObservation('')

        # 2 events
-        event_stream.add_event(hello_action, EventSource.USER)
-        event_stream.add_event(hello_observation, EventSource.USER)
+        state.history.append(hello_action)
+        state.history.append(hello_observation)

        cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
-        cmd_observation_1 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_1._id
-        )
+        cmd_action_1._id = 1
+        state.history.append(cmd_action_1)
+        cmd_observation_1 = CmdOutputObservation(content='', command='ls', command_id=1)
        cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        state.history.append(cmd_observation_1)
        # 4 events

        cmd_action_2 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
-        cmd_observation_2 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_2._id
-        )
+        cmd_action_2._id = 2
+        state.history.append(cmd_action_2)
+        cmd_observation_2 = CmdOutputObservation(content='', command='ls', command_id=2)
        cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        state.history.append(cmd_observation_2)
        # 6 events

        # random user message just because we can
        message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
        # 8 events

        assert stuck_detector.is_stuck() is False
        assert stuck_detector.state.almost_stuck == 2

        cmd_action_3 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
-        cmd_observation_3 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_3._id
-        )
+        cmd_action_3._id = 3
+        state.history.append(cmd_action_3)
+        cmd_observation_3 = CmdOutputObservation(content='', command='ls', command_id=3)
        cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        state.history.append(cmd_observation_3)
        # 10 events

-        assert len(collect_events(event_stream)) == 10
-        assert len(list(stuck_detector.state.history.get_events())) == 8
+        assert len(state.history) == 10
        assert (
-            len(
-                get_pairs_from_events(
-                    stuck_detector.state.history.get_events_as_list(
-                        include_delegates=True
-                    )
-                )
-            )
-            == 5
-        )
+            len(state.history) == 10
+        )  # Adjusted since history is a list and the controller is not running
+
+        # FIXME: the pair-count check below is disabled; is get_pairs_from_events still covered elsewhere?
+        # assert (
+        #    len(
+        #        get_pairs_from_events(state.history)
+        #    )
+        #    == 5
+        # )

        assert stuck_detector.is_stuck() is False
        assert stuck_detector.state.almost_stuck == 1

        cmd_action_4 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_4, EventSource.AGENT)
-        cmd_observation_4 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_4._id
-        )
+        cmd_action_4._id = 4
+        state.history.append(cmd_action_4)
+        cmd_observation_4 = CmdOutputObservation(content='', command='ls', command_id=4)
        cmd_observation_4._cause = cmd_action_4._id
-        event_stream.add_event(cmd_observation_4, EventSource.USER)
+        state.history.append(cmd_observation_4)
        # 12 events

-        assert len(collect_events(event_stream)) == 12
-        assert len(list(stuck_detector.state.history.get_events())) == 10
-        assert (
-            len(
-                get_pairs_from_events(
-                    stuck_detector.state.history.get_events_as_list(
-                        include_delegates=True
-                    )
-                )
-            )
-            == 6
-        )
+        assert len(state.history) == 12
+        # assert (
+        #    len(
+        #        get_pairs_from_events(state.history)
+        #    )
+        #    == 6
+        # )

        with patch('logging.Logger.warning') as mock_warning:
            assert stuck_detector.is_stuck() is True
            assert stuck_detector.state.almost_stuck == 0
            mock_warning.assert_called_once_with('Action, Observation loop detected')

-    def test_is_stuck_repeating_action_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_repeating_action_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
        # (action, error_observation), not necessarily the same error
        message_action = MessageAction(content='Done', wait_for_response=False)
        message_action._source = EventSource.USER

        hello_action = MessageAction(content='Hello', wait_for_response=False)
        hello_observation = NullObservation(content='')
-        event_stream.add_event(hello_action, EventSource.USER)
-        hello_observation._cause = hello_action._id
-        event_stream.add_event(hello_observation, EventSource.USER)
+        state.history.append(hello_action)
+        # hello_observation._cause = hello_action._id
+        state.history.append(hello_observation)
        # 2 events

        cmd_action_1 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
        error_observation_1 = ErrorObservation(content='Command not found')
-        error_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(error_observation_1, EventSource.USER)
+        # error_observation_1._cause = cmd_action_1._id
+        state.history.append(error_observation_1)
        # 4 events

        cmd_action_2 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
        error_observation_2 = ErrorObservation(
            content='Command still not found or another error'
        )
-        error_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(error_observation_2, EventSource.USER)
+        # error_observation_2._cause = cmd_action_2._id
+        state.history.append(error_observation_2)
        # 6 events

        message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
        # 8 events

        cmd_action_3 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
        error_observation_3 = ErrorObservation(content='Different error')
-        error_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(error_observation_3, EventSource.USER)
+        # error_observation_3._cause = cmd_action_3._id
+        state.history.append(error_observation_3)
        # 10 events

        cmd_action_4 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_4, EventSource.AGENT)
+        state.history.append(cmd_action_4)
        error_observation_4 = ErrorObservation(content='Command not found')
-        error_observation_4._cause = cmd_action_4._id
-        event_stream.add_event(error_observation_4, EventSource.USER)
+        # error_observation_4._cause = cmd_action_4._id
+        state.history.append(error_observation_4)
        # 12 events

        with patch('logging.Logger.warning') as mock_warning:
@@ -267,11 +250,10 @@ class TestStuckDetector:
                'Action, ErrorObservation loop detected'
            )

-    def test_is_stuck_invalid_syntax_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_invalid_syntax_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
        self._impl_syntax_error_events(
-            event_stream,
+            state,
            error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
            random_line=False,
        )
@@ -280,10 +262,11 @@ class TestStuckDetector:
            assert stuck_detector.is_stuck() is True

    def test_is_not_stuck_invalid_syntax_error_random_lines(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
    ):
+        state = stuck_detector.state
        self._impl_syntax_error_events(
-            event_stream,
+            state,
            error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
            random_line=True,
        )
@@ -292,10 +275,11 @@ class TestStuckDetector:
            assert stuck_detector.is_stuck() is False

    def test_is_not_stuck_invalid_syntax_error_only_three_incidents(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
    ):
+        state = stuck_detector.state
        self._impl_syntax_error_events(
-            event_stream,
+            state,
            error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
            random_line=True,
            incidents=3,
@@ -304,11 +288,10 @@ class TestStuckDetector:
        with patch('logging.Logger.warning'):
            assert stuck_detector.is_stuck() is False

-    def test_is_stuck_incomplete_input_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_incomplete_input_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
        self._impl_syntax_error_events(
-            event_stream,
+            state,
            error_message='SyntaxError: incomplete input',
            random_line=False,
        )
@@ -316,11 +299,10 @@ class TestStuckDetector:
        with patch('logging.Logger.warning'):
            assert stuck_detector.is_stuck() is True

-    def test_is_not_stuck_incomplete_input_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_not_stuck_incomplete_input_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
        self._impl_syntax_error_events(
-            event_stream,
+            state,
            error_message='SyntaxError: incomplete input',
            random_line=True,
        )
@@ -329,238 +311,239 @@ class TestStuckDetector:
            assert stuck_detector.is_stuck() is False

    def test_is_not_stuck_ipython_unterminated_string_error_random_lines(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
    ):
-        self._impl_unterminated_string_error_events(event_stream, random_line=True)
+        state = stuck_detector.state
+        self._impl_unterminated_string_error_events(state, random_line=True)

        with patch('logging.Logger.warning'):
            assert stuck_detector.is_stuck() is False

    def test_is_not_stuck_ipython_unterminated_string_error_only_three_incidents(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
    ):
+        state = stuck_detector.state
        self._impl_unterminated_string_error_events(
-            event_stream, random_line=False, incidents=3
+            state, random_line=False, incidents=3
        )

        with patch('logging.Logger.warning'):
            assert stuck_detector.is_stuck() is False

    def test_is_stuck_ipython_unterminated_string_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
    ):
-        self._impl_unterminated_string_error_events(event_stream, random_line=False)
+        state = stuck_detector.state
+        self._impl_unterminated_string_error_events(state, random_line=False)

        with patch('logging.Logger.warning'):
            assert stuck_detector.is_stuck() is True

    def test_is_not_stuck_ipython_syntax_error_not_at_end(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
    ):
+        state = stuck_detector.state
        # this test is to make sure we don't get false positives
        # since the "at line x" is changing in between!
        ipython_action_1 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_1, EventSource.AGENT)
+        state.history.append(ipython_action_1)
        ipython_observation_1 = IPythonRunCellObservation(
            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)\nThis is some additional output',
            code='print("hello',
        )
-        ipython_observation_1._cause = ipython_action_1._id
-        event_stream.add_event(ipython_observation_1, EventSource.USER)
+        # ipython_observation_1._cause = ipython_action_1._id
+        state.history.append(ipython_observation_1)

        ipython_action_2 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_2, EventSource.AGENT)
+        state.history.append(ipython_action_2)
        ipython_observation_2 = IPythonRunCellObservation(
            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)\nToo much output here on and on',
            code='print("hello',
        )
-        ipython_observation_2._cause = ipython_action_2._id
-        event_stream.add_event(ipython_observation_2, EventSource.USER)
+        # ipython_observation_2._cause = ipython_action_2._id
+        state.history.append(ipython_observation_2)

        ipython_action_3 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_3, EventSource.AGENT)
+        state.history.append(ipython_action_3)
        ipython_observation_3 = IPythonRunCellObservation(
            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 3)\nEnough',
            code='print("hello',
        )
-        ipython_observation_3._cause = ipython_action_3._id
-        event_stream.add_event(ipython_observation_3, EventSource.USER)
+        # ipython_observation_3._cause = ipython_action_3._id
+        state.history.append(ipython_observation_3)

        ipython_action_4 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_4, EventSource.AGENT)
+        state.history.append(ipython_action_4)
        ipython_observation_4 = IPythonRunCellObservation(
            content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 2)\nLast line of output',
            code='print("hello',
        )
-        ipython_observation_4._cause = ipython_action_4._id
-        event_stream.add_event(ipython_observation_4, EventSource.USER)
+        # ipython_observation_4._cause = ipython_action_4._id
+        state.history.append(ipython_observation_4)

        with patch('logging.Logger.warning') as mock_warning:
            assert stuck_detector.is_stuck() is False
            mock_warning.assert_not_called()

    def test_is_stuck_repeating_action_observation_pattern(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
    ):
+        state = stuck_detector.state
        message_action = MessageAction(content='Come on', wait_for_response=False)
        message_action._source = EventSource.USER
-        event_stream.add_event(message_action, EventSource.USER)
+        state.history.append(message_action)
        message_observation = NullObservation(content='')
-        event_stream.add_event(message_observation, EventSource.USER)
+        state.history.append(message_observation)

        cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
        cmd_observation_1 = CmdOutputObservation(
            command_id=1, command='ls', content='file1.txt\nfile2.txt'
        )
-        cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        # cmd_observation_1._cause = cmd_action_1._id
+        state.history.append(cmd_observation_1)

        read_action_1 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_1, EventSource.AGENT)
+        state.history.append(read_action_1)
        read_observation_1 = FileReadObservation(
            content='File content', path='file1.txt'
        )
-        read_observation_1._cause = read_action_1._id
-        event_stream.add_event(read_observation_1, EventSource.USER)
+        # read_observation_1._cause = read_action_1._id
+        state.history.append(read_observation_1)

        cmd_action_2 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
        cmd_observation_2 = CmdOutputObservation(
            command_id=2, command='ls', content='file1.txt\nfile2.txt'
        )
-        cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        # cmd_observation_2._cause = cmd_action_2._id
+        state.history.append(cmd_observation_2)

        read_action_2 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_2, EventSource.AGENT)
+        state.history.append(read_action_2)
        read_observation_2 = FileReadObservation(
            content='File content', path='file1.txt'
        )
-        read_observation_2._cause = read_action_2._id
-        event_stream.add_event(read_observation_2, EventSource.USER)
+        # read_observation_2._cause = read_action_2._id
+        state.history.append(read_observation_2)

        # one more message to break the pattern
        message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)

        cmd_action_3 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
        cmd_observation_3 = CmdOutputObservation(
            command_id=3, command='ls', content='file1.txt\nfile2.txt'
        )
-        cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        # cmd_observation_3._cause = cmd_action_3._id
+        state.history.append(cmd_observation_3)

        read_action_3 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_3, EventSource.AGENT)
+        state.history.append(read_action_3)
        read_observation_3 = FileReadObservation(
            content='File content', path='file1.txt'
        )
-        read_observation_3._cause = read_action_3._id
-        event_stream.add_event(read_observation_3, EventSource.USER)
+        # read_observation_3._cause = read_action_3._id
+        state.history.append(read_observation_3)

        with patch('logging.Logger.warning') as mock_warning:
            assert stuck_detector.is_stuck() is True
            mock_warning.assert_called_once_with('Action, Observation pattern detected')

-    def test_is_stuck_not_stuck(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
        message_action = MessageAction(content='Done', wait_for_response=False)
        message_action._source = EventSource.USER

        hello_action = MessageAction(content='Hello', wait_for_response=False)
-        event_stream.add_event(hello_action, EventSource.USER)
+        state.history.append(hello_action)
        hello_observation = NullObservation(content='')
-        hello_observation._cause = hello_action._id
-        event_stream.add_event(hello_observation, EventSource.USER)
+        # hello_observation._cause = hello_action._id
+        state.history.append(hello_observation)

        cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
        cmd_observation_1 = CmdOutputObservation(
            command_id=cmd_action_1.id, command='ls', content='file1.txt\nfile2.txt'
        )
-        cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        # cmd_observation_1._cause = cmd_action_1._id
+        state.history.append(cmd_observation_1)

        read_action_1 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_1, EventSource.AGENT)
+        state.history.append(read_action_1)
        read_observation_1 = FileReadObservation(
            content='File content', path='file1.txt'
        )
-        read_observation_1._cause = read_action_1._id
-        event_stream.add_event(read_observation_1, EventSource.USER)
+        # read_observation_1._cause = read_action_1._id
+        state.history.append(read_observation_1)

        cmd_action_2 = CmdRunAction(command='pwd')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
        cmd_observation_2 = CmdOutputObservation(
            command_id=2, command='pwd', content='/home/user'
        )
-        cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        # cmd_observation_2._cause = cmd_action_2._id
+        state.history.append(cmd_observation_2)

        read_action_2 = FileReadAction(path='file2.txt')
-        event_stream.add_event(read_action_2, EventSource.AGENT)
+        state.history.append(read_action_2)
        read_observation_2 = FileReadObservation(
            content='Another file content', path='file2.txt'
        )
-        read_observation_2._cause = read_action_2._id
-        event_stream.add_event(read_observation_2, EventSource.USER)
+        # read_observation_2._cause = read_action_2._id
+        state.history.append(read_observation_2)

        message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)

        cmd_action_3 = CmdRunAction(command='pwd')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
        cmd_observation_3 = CmdOutputObservation(
            command_id=cmd_action_3.id, command='pwd', content='/home/user'
        )
-        cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        # cmd_observation_3._cause = cmd_action_3._id
+        state.history.append(cmd_observation_3)

        read_action_3 = FileReadAction(path='file2.txt')
-        event_stream.add_event(read_action_3, EventSource.AGENT)
+        state.history.append(read_action_3)
        read_observation_3 = FileReadObservation(
            content='Another file content', path='file2.txt'
        )
-        read_observation_3._cause = read_action_3._id
-        event_stream.add_event(read_observation_3, EventSource.USER)
+        # read_observation_3._cause = read_action_3._id
+        state.history.append(read_observation_3)

        assert stuck_detector.is_stuck() is False

-    def test_is_stuck_monologue(self, stuck_detector, event_stream):
-        # Add events to the event stream
+    def test_is_stuck_monologue(self, stuck_detector):
+        state = stuck_detector.state
+        # Add events to the history list directly
        message_action_1 = MessageAction(content='Hi there!')
-        event_stream.add_event(message_action_1, EventSource.USER)
        message_action_1._source = EventSource.USER
-
+        state.history.append(message_action_1)
        message_action_2 = MessageAction(content='Hi there!')
-        event_stream.add_event(message_action_2, EventSource.AGENT)
        message_action_2._source = EventSource.AGENT
-
+        state.history.append(message_action_2)
        message_action_3 = MessageAction(content='How are you?')
-        event_stream.add_event(message_action_3, EventSource.USER)
        message_action_3._source = EventSource.USER
+        state.history.append(message_action_3)

        cmd_kill_action = CmdRunAction(
            command='echo 42', thought="I'm not stuck, he's stuck"
        )
-        event_stream.add_event(cmd_kill_action, EventSource.AGENT)
+        state.history.append(cmd_kill_action)

        message_action_4 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_4, EventSource.AGENT)
        message_action_4._source = EventSource.AGENT
-
+        state.history.append(message_action_4)
        message_action_5 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_5, EventSource.AGENT)
        message_action_5._source = EventSource.AGENT
-
+        state.history.append(message_action_5)
        message_action_6 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_6, EventSource.AGENT)
        message_action_6._source = EventSource.AGENT
+        state.history.append(message_action_6)

        assert stuck_detector.is_stuck()

@@ -571,16 +554,15 @@ class TestStuckDetector:
            command='storybook',
            exit_code=0,
        )
-        cmd_output_observation._cause = cmd_kill_action._id
-        event_stream.add_event(cmd_output_observation, EventSource.USER)
+        # cmd_output_observation._cause = cmd_kill_action._id
+        state.history.append(cmd_output_observation)

        message_action_7 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_7, EventSource.AGENT)
        message_action_7._source = EventSource.AGENT
-
+        state.history.append(message_action_7)
        message_action_8 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_8, EventSource.AGENT)
        message_action_8._source = EventSource.AGENT
+        state.history.append(message_action_8)

        with patch('logging.Logger.warning'):
            assert not stuck_detector.is_stuck()
@@ -595,7 +577,6 @@ class TestAgentController:
        )
        controller.delegate = None
        controller.state = Mock()
-        controller.state.history = ShortTermHistory()
        return controller

    def test_is_stuck_delegate_stuck(self, controller: AgentController):
--- a/tests/unit/test_llm_config.py
+++ b/tests/unit/test_llm_config.py
@@ -0,0 +1,228 @@
+import pathlib
+
+import pytest
+
+from openhands.core.config import AppConfig
+from openhands.core.config.utils import load_from_toml
+
+
+@pytest.fixture
+def default_config(monkeypatch):
+    # Fixture to provide a default AppConfig instance
+    yield AppConfig()
+
+
+@pytest.fixture
+def generic_llm_toml(tmp_path: pathlib.Path) -> str:
+    """Fixture to create a generic LLM TOML configuration with all custom LLMs
+    providing mandatory 'model' and 'api_key', and testing fallback to the generic section values
+    for other attributes like 'num_retries'.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+embedding_model = "base-embedding"
+num_retries = 3
+
+[llm.custom1]
+model = "custom-model-1"
+api_key = "custom-api-key-1"
+# 'num_retries' is not overridden and should fallback to the value from [llm]
+
+[llm.custom2]
+model = "custom-model-2"
+api_key = "custom-api-key-2"
+num_retries = 5  # Overridden value
+
+[llm.custom3]
+model = "custom-model-3"
+api_key = "custom-api-key-3"
+# No overrides for additional attributes
+    """
+    toml_file = tmp_path / 'llm_config.toml'
+    toml_file.write_text(toml_content)
+    return str(toml_file)
+
+
+def test_load_from_toml_llm_with_fallback(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations fall back to the generic [llm] section
+    for non-overridden attributes like 'num_retries'.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify generic LLM configuration
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.embedding_model == 'base-embedding'
+    assert generic_llm.num_retries == 3
+
+    # Verify custom1 LLM falls back to 'num_retries' from the generic [llm] section
+    custom1 = default_config.get_llm_config('custom1')
+    assert custom1.model == 'custom-model-1'
+    assert custom1.api_key == 'custom-api-key-1'
+    assert custom1.embedding_model == 'base-embedding'
+    assert custom1.num_retries == 3  # from [llm]
+
+    # Verify custom2 LLM overrides 'num_retries'
+    custom2 = default_config.get_llm_config('custom2')
+    assert custom2.model == 'custom-model-2'
+    assert custom2.api_key == 'custom-api-key-2'
+    assert custom2.embedding_model == 'base-embedding'
+    assert custom2.num_retries == 5  # overridden value
+
+    # Verify custom3 LLM inherits all attributes except 'model' and 'api_key'
+    custom3 = default_config.get_llm_config('custom3')
+    assert custom3.model == 'custom-model-3'
+    assert custom3.api_key == 'custom-api-key-3'
+    assert custom3.embedding_model == 'base-embedding'
+    assert custom3.num_retries == 3  # from [llm]
+
+
+def test_load_from_toml_llm_custom_overrides_all(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that a custom LLM can fully override all attributes from the generic [llm] section."""
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+embedding_model = "base-embedding"
+num_retries = 3
+
+[llm.custom_full]
+model = "full-custom-model"
+api_key = "full-custom-api-key"
+embedding_model = "full-custom-embedding"
+num_retries = 10
+    """
+    toml_file = tmp_path / 'full_override_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify generic LLM configuration remains unchanged
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.embedding_model == 'base-embedding'
+    assert generic_llm.num_retries == 3
+
+    # Verify custom_full LLM overrides all attributes
+    custom_full = default_config.get_llm_config('custom_full')
+    assert custom_full.model == 'full-custom-model'
+    assert custom_full.api_key == 'full-custom-api-key'
+    assert custom_full.embedding_model == 'full-custom-embedding'
+    assert custom_full.num_retries == 10  # overridden value
+
+
+def test_load_from_toml_llm_custom_partial_override(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations can partially override attributes
+    from the generic [llm] section while inheriting others.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify custom1 LLM overrides 'model' and 'api_key' but inherits 'num_retries'
+    custom1 = default_config.get_llm_config('custom1')
+    assert custom1.model == 'custom-model-1'
+    assert custom1.api_key == 'custom-api-key-1'
+    assert custom1.embedding_model == 'base-embedding'
+    assert custom1.num_retries == 3  # from [llm]
+
+    # Verify custom2 LLM overrides 'model', 'api_key', and 'num_retries'
+    custom2 = default_config.get_llm_config('custom2')
+    assert custom2.model == 'custom-model-2'
+    assert custom2.api_key == 'custom-api-key-2'
+    assert custom2.embedding_model == 'base-embedding'
+    assert custom2.num_retries == 5  # Overridden value
+
+
+def test_load_from_toml_llm_custom_no_override(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations with no additional overrides
+    inherit all non-specified attributes from the generic [llm] section.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify custom3 LLM inherits 'embedding_model' and 'num_retries' from generic
+    custom3 = default_config.get_llm_config('custom3')
+    assert custom3.model == 'custom-model-3'
+    assert custom3.api_key == 'custom-api-key-3'
+    assert custom3.embedding_model == 'base-embedding'
+    assert custom3.num_retries == 3  # from [llm]
+
+
+def test_load_from_toml_llm_missing_generic(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that custom LLM configurations without a generic [llm] section
+    use only their own attributes and fall back to defaults for others.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm.custom_only]
+model = "custom-only-model"
+api_key = "custom-only-api-key"
+    """
+    toml_file = tmp_path / 'custom_only_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify custom_only LLM uses its own attributes and defaults for others
+    custom_only = default_config.get_llm_config('custom_only')
+    assert custom_only.model == 'custom-only-model'
+    assert custom_only.api_key == 'custom-only-api-key'
+    assert custom_only.embedding_model == 'local'  # default value
+    assert custom_only.num_retries == 8  # default value
+
+
+def test_load_from_toml_llm_invalid_config(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that invalid custom LLM configurations do not override the generic
+    [llm] values and raise appropriate warnings.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+num_retries = 3
+
+[llm.invalid_custom]
+unknown_attr = "should_not_exist"
+    """
+    toml_file = tmp_path / 'invalid_custom_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify generic LLM is loaded correctly
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.num_retries == 3
+
+    # Verify invalid_custom LLM does not override generic attributes
+    custom_invalid = default_config.get_llm_config('invalid_custom')
+    assert custom_invalid.model == 'base-model'
+    assert custom_invalid.api_key == 'base-api-key'
+    assert custom_invalid.num_retries == 3  # inherited from [llm]
+    assert custom_invalid.embedding_model == 'local'  # default value
--- a/tests/unit/test_micro_agents.py
+++ b/tests/unit/test_micro_agents.py
@@ -10,10 +10,8 @@ from openhands.agenthub.micro.registry import all_microagents
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.events import EventSource
 from openhands.events.action import MessageAction
 from openhands.events.stream import EventStream
-from openhands.memory.history import ShortTermHistory
 from openhands.storage import get_file_store


@@ -74,10 +72,10 @@ def test_coder_agent_with_summary(event_stream: EventStream, agent_configs: dict
    )
    assert coder_agent is not None

+    # give it some history
    task = 'This is a dummy task'
-    history = ShortTermHistory()
-    history.set_event_stream(event_stream)
-    event_stream.add_event(MessageAction(content=task), EventSource.USER)
+    history = list()
+    history.append(MessageAction(content=task))

    summary = 'This is a dummy summary about this repo'
    state = State(history=history, inputs={'summary': summary})
@@ -119,10 +117,10 @@ def test_coder_agent_without_summary(event_stream: EventStream, agent_configs: d
    )
    assert coder_agent is not None

+    # give it some history
    task = 'This is a dummy task'
-    history = ShortTermHistory()
-    history.set_event_stream(event_stream)
-    event_stream.add_event(MessageAction(content=task), EventSource.USER)
+    history = list()
+    history.append(MessageAction(content=task))

    # set state without codebase summary
    state = State(history=history)
--- a/tests/unit/test_prompt_caching.py
+++ b/tests/unit/test_prompt_caching.py
@@ -1,14 +1,12 @@
-from unittest.mock import MagicMock, Mock, patch
+from unittest.mock import Mock, patch

 import pytest

 from openhands.agenthub.codeact_agent.codeact_agent import CodeActAgent
 from openhands.core.config import AgentConfig, LLMConfig
-from openhands.events import EventSource, EventStream
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.llm.llm import LLM
-from openhands.storage import get_file_store


@pytest.fixture
@@ -19,29 +17,34 @@ def mock_llm():
    return llm


-@pytest.fixture
-def mock_event_stream(tmp_path):
-    file_store = get_file_store('local', str(tmp_path))
-    return EventStream('test_session', file_store)
-
-
@pytest.fixture
 def codeact_agent(mock_llm):
    config = AgentConfig()
    return CodeActAgent(mock_llm, config)


-def test_get_messages_with_reminder(codeact_agent, mock_event_stream):
-    # Add some events to the stream
-    mock_event_stream.add_event(MessageAction('Initial user message'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Sure!'), EventSource.AGENT)
-    mock_event_stream.add_event(MessageAction('Hello, agent!'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Hello, user!'), EventSource.AGENT)
-    mock_event_stream.add_event(MessageAction('Laaaaaaaast!'), EventSource.USER)
+def test_get_messages_with_reminder(codeact_agent: CodeActAgent):
+    # Add some events to history
+    history = list()
+    message_action_1 = MessageAction('Initial user message')
+    message_action_1._source = 'user'
+    history.append(message_action_1)
+    message_action_2 = MessageAction('Sure!')
+    message_action_2._source = 'assistant'
+    history.append(message_action_2)
+    message_action_3 = MessageAction('Hello, agent!')
+    message_action_3._source = 'user'
+    history.append(message_action_3)
+    message_action_4 = MessageAction('Hello, user!')
+    message_action_4._source = 'assistant'
+    history.append(message_action_4)
+    message_action_5 = MessageAction('Laaaaaaaast!')
+    message_action_5._source = 'user'
+    history.append(message_action_5)

    codeact_agent.reset()
    messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=5, iteration=0)
+        Mock(history=history, max_iterations=5, iteration=0)
    )

    assert (
@@ -71,19 +74,20 @@ def test_get_messages_with_reminder(codeact_agent, mock_event_stream):
    )


-def test_get_messages_prompt_caching(codeact_agent, mock_event_stream):
+def test_get_messages_prompt_caching(codeact_agent: CodeActAgent):
+    history = list()
    # Add multiple user and agent messages
    for i in range(15):
-        mock_event_stream.add_event(
-            MessageAction(f'User message {i}'), EventSource.USER
-        )
-        mock_event_stream.add_event(
-            MessageAction(f'Agent message {i}'), EventSource.AGENT
-        )
+        message_action_user = MessageAction(f'User message {i}')
+        message_action_user._source = 'user'
+        history.append(message_action_user)
+        message_action_agent = MessageAction(f'Agent message {i}')
+        message_action_agent._source = 'assistant'
+        history.append(message_action_agent)

    codeact_agent.reset()
    messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=10, iteration=5)
+        Mock(history=history, max_iterations=10, iteration=5)
    )

    # Check that only the last two user messages have cache_prompt=True
@@ -104,15 +108,19 @@ def test_get_messages_prompt_caching(codeact_agent, mock_event_stream):
    assert cached_user_messages[3].content[0].text.startswith('User message 1')


-def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
+def test_get_messages_with_cmd_action(codeact_agent: CodeActAgent):
+    history = list()
    # Add a mix of actions and observations
    message_action_1 = MessageAction(
        "Let's list the contents of the current directory."
    )
-    mock_event_stream.add_event(message_action_1, EventSource.USER)
+    message_action_1._source = 'user'
+    history.append(message_action_1)

    cmd_action_1 = CmdRunAction('ls -l', thought='List files in current directory')
-    mock_event_stream.add_event(cmd_action_1, EventSource.AGENT)
+    cmd_action_1._source = 'agent'
+    cmd_action_1._id = 'cmd_1'
+    history.append(cmd_action_1)

    cmd_observation_1 = CmdOutputObservation(
        content='total 0\n-rw-r--r-- 1 user group 0 Jan 1 00:00 file1.txt\n-rw-r--r-- 1 user group 0 Jan 1 00:00 file2.txt',
@@ -120,13 +128,17 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
        command='ls -l',
        exit_code=0,
    )
-    mock_event_stream.add_event(cmd_observation_1, EventSource.USER)
+    cmd_observation_1._source = 'user'
+    history.append(cmd_observation_1)

    message_action_2 = MessageAction("Now, let's create a new directory.")
-    mock_event_stream.add_event(message_action_2, EventSource.AGENT)
+    message_action_2._source = 'agent'
+    history.append(message_action_2)

    cmd_action_2 = CmdRunAction('mkdir new_directory', thought='Create a new directory')
-    mock_event_stream.add_event(cmd_action_2, EventSource.AGENT)
+    cmd_action_2._source = 'agent'
+    cmd_action_2._id = 'cmd_2'
+    history.append(cmd_action_2)

    cmd_observation_2 = CmdOutputObservation(
        content='',
@@ -134,11 +146,12 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
        command='mkdir new_directory',
        exit_code=0,
    )
-    mock_event_stream.add_event(cmd_observation_2, EventSource.USER)
+    cmd_observation_2._source = 'user'
+    history.append(cmd_observation_2)

    codeact_agent.reset()
    messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=5, iteration=0)
+        Mock(history=history, max_iterations=5, iteration=0)
    )

    # Assert the presence of key elements in the messages
@@ -180,16 +193,14 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
    assert 'ENVIRONMENT REMINDER: You have 5 turns' in messages[5].content[1].text


-def test_prompt_caching_headers(codeact_agent, mock_event_stream):
+def test_prompt_caching_headers(codeact_agent: CodeActAgent):
+    history = list()
    # Setup
-    mock_event_stream.add_event(MessageAction('Hello, agent!'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Hello, user!'), EventSource.AGENT)
-
-    mock_short_term_history = MagicMock()
-    mock_short_term_history.get_last_user_message.return_value = 'Hello, agent!'
+    history.append(MessageAction('Hello, agent!'))
+    history.append(MessageAction('Hello, user!'))

    mock_state = Mock()
-    mock_state.history = mock_short_term_history
+    mock_state.history = history
    mock_state.max_iterations = 5
    mock_state.iteration = 0

--- a/tests/unit/test_prompt_manager.py
+++ b/tests/unit/test_prompt_manager.py
@@ -14,7 +14,7 @@ def prompt_dir(tmp_path):
    shutil.copytree('openhands/agenthub/codeact_agent', tmp_path, dirs_exist_ok=True)

    # Return the temporary directory path
-    return tmp_path
+    return str(tmp_path)  # Return string path


 SAMPLE_AGENT_SKILLS_DOCS = """Sample agent skills documentation"""
@@ -26,10 +26,10 @@ def agent_skills_docs():


 def test_prompt_manager_without_micro_agent(prompt_dir, agent_skills_docs):
-    manager = PromptManager(prompt_dir, agent_skills_docs)
+    manager = PromptManager(prompt_dir)

    assert manager.prompt_dir == prompt_dir
-    assert manager.agent_skills_docs == agent_skills_docs
+    # assert manager.agent_skills_docs == agent_skills_docs
    assert manager.micro_agent is None

    assert isinstance(manager.system_message, str)
@@ -37,7 +37,7 @@ def test_prompt_manager_without_micro_agent(prompt_dir, agent_skills_docs):
        "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions."
        in manager.system_message
    )
-    assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
+    # assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
    assert isinstance(manager.initial_user_message, str)
    assert '--- BEGIN OF GUIDELINE ---' not in manager.initial_user_message
    assert '--- END OF GUIDELINE ---' not in manager.initial_user_message
@@ -64,12 +64,11 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):

    manager = PromptManager(
        prompt_dir=prompt_dir,
-        agent_skills_docs=agent_skills_docs,
        micro_agent=mock_micro_agent,
    )

    assert manager.prompt_dir == prompt_dir
-    assert manager.agent_skills_docs == agent_skills_docs
+    # assert manager.agent_skills_docs == agent_skills_docs
    assert manager.micro_agent == mock_micro_agent

    assert isinstance(manager.system_message, str)
@@ -77,7 +76,7 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
        "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions."
        in manager.system_message
    )
-    assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
+    # assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message

    assert isinstance(manager.initial_user_message, str)
    assert (
@@ -106,11 +105,19 @@ def test_prompt_manager_template_rendering(prompt_dir, agent_skills_docs):
    with open(os.path.join(prompt_dir, 'user_prompt.j2'), 'w') as f:
        f.write('User prompt: {{ micro_agent }}')

-    manager = PromptManager(prompt_dir, agent_skills_docs)
+    manager = PromptManager(prompt_dir)

-    assert manager.system_message == f'System prompt: {agent_skills_docs}'
+    # assert manager.system_message == f'System prompt: {agent_skills_docs}'
    assert manager.initial_user_message == 'User prompt: None'

    # Clean up temporary files
    os.remove(os.path.join(prompt_dir, 'system_prompt.j2'))
    os.remove(os.path.join(prompt_dir, 'user_prompt.j2'))
+
+
+def test_prompt_manager_loads_agent_skill(prompt_dir):
+    manager = PromptManager(prompt_dir)
+    assert (
+        'open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None'
+        in manager.system_message
+    )
Author	SHA1	Message	Date
Engel Nyst	4d05ab1059	update summarize prompt	2024-10-31 12:11:39 +01:00
Engel Nyst	63284d39f8	poetry lock	2024-10-27 09:09:16 +01:00
Engel Nyst	41c03ad828	Merge branch 'enyst/eventstream-state' into enyst/memory-agent	2024-10-27 09:07:46 +01:00
Engel Nyst	a213c65b8a	make extra sure we have a valid start	2024-10-27 09:02:01 +01:00
Engel Nyst	ebeab75373	clean up verbose log	2024-10-27 08:27:46 +01:00
Engel Nyst	f42cbedfe3	fix tests	2024-10-27 08:14:20 +01:00
Engel Nyst	f53e1cf118	set delegates start explicitly; minor tweaks	2024-10-27 07:49:32 +01:00
Engel Nyst	cfc158df75	Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/eventstream-state	2024-10-27 06:24:02 +01:00
Engel Nyst	93cfd323a2	tweak init/restore	2024-10-27 06:20:41 +01:00
Engel Nyst	94c68be15f	save/restore state automatically	2024-10-27 04:16:36 +01:00
Engel Nyst	34e0f8a882	remove script that got here by accident	2024-10-27 03:47:20 +01:00
Engel Nyst	04b6d70c25	init history from the event stream	2024-10-27 01:55:54 +02:00
Engel Nyst	9af6e5ef0e	not worth caching delegates if only used once or twice per session	2024-10-26 02:36:56 +02:00
Engel Nyst	c6a9028916	init history for restored state	2024-10-26 02:34:59 +02:00
Engel Nyst	99a257ca71	Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/eventstream-state	2024-10-26 01:57:15 +02:00
Engel Nyst	d4d3aa0134	Merge branch 'enyst/eventstream-state' into enyst/memory-agent	2024-10-25 02:53:25 +02:00
Engel Nyst	54f60acc0b	clean up obsolete config var - sessions are always saved if filestore is appropriate	2024-10-25 02:50:53 +02:00
Engel Nyst	11d82f238d	save events as they happen	2024-10-25 02:49:26 +02:00
Engel Nyst	ad0b9b28f1	Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/eventstream-state	2024-10-25 01:47:23 +02:00
Engel Nyst	274ad619a2	Merge branch 'enyst/refactor_template' into enyst/memory-agent	2024-10-24 14:12:13 +02:00
Engel Nyst	6f282b90da	fix user prompt; bad coverage	2024-10-24 14:11:39 +02:00
Engel Nyst	4efcc02776	ruff	2024-10-24 14:09:48 +02:00
Engel Nyst	6141d0b3cf	Merge branch 'enyst/refactor_template' into enyst/memory-agent	2024-10-24 13:54:18 +02:00
Engel Nyst	1df7aaa0cf	add user-defined template directory	2024-10-24 13:10:05 +02:00
Engel Nyst	ada2ebd4f5	tweak agent skill display	2024-10-24 12:16:29 +02:00
Engel Nyst	6732359894	strange leftover from another branch	2024-10-24 11:36:08 +02:00
Engel Nyst	bf9b8acbab	kill some whitespace	2024-10-24 11:32:50 +02:00
Engel Nyst	e2c343a733	fix useless vars	2024-10-24 11:22:20 +02:00
Engel Nyst	bbd5211c3b	remove obsolete md	2024-10-24 11:18:18 +02:00
Engel Nyst	9629a73391	fix template loading	2024-10-24 11:09:33 +02:00
Engel Nyst	7930457211	create examples template	2024-10-24 10:03:03 +02:00
Engel Nyst	5df104dcb2	break down agent skills	2024-10-24 09:33:53 +02:00
Engel Nyst	e75a489de9	add agent skills and yaml	2024-10-24 08:02:23 +02:00
Engel Nyst	b93c81869a	tweak template	2024-10-24 07:06:04 +02:00
Engel Nyst	106bbb5ca4	ruff	2024-10-24 03:17:19 +02:00
Engel Nyst	1d582ac100	fix tokenizer	2024-10-24 01:58:17 +02:00
Engel Nyst	a858083d50	break down prompts; tweak core memory; rewrite algo	2024-10-24 01:44:20 +02:00
Engel Nyst	81b19c268c	add script for testing, clean up obsolete content	2024-10-22 09:03:55 +02:00
Engel Nyst	6f9c922cd7	remove eval script	2024-10-22 08:34:25 +02:00
Engel Nyst	2a448f212c	adapt action, prompt, some clean up logic	2024-10-22 07:55:47 +02:00
Engel Nyst	a25a867ed5	add tokenizer from HF	2024-10-22 06:33:41 +02:00
Engel Nyst	a060cbb882	fix condensation; add debugging	2024-10-22 04:41:24 +02:00
Engel Nyst	b631e53455	fixes; debugging test	2024-10-22 00:32:29 +02:00
Engel Nyst	083edd4444	configurations wip	2024-10-22 00:30:01 +02:00
Engel Nyst	374243182e	fix parser (o1 !!)	2024-10-21 20:04:59 +02:00
Engel Nyst	5ad9ef4d7b	fix leftover calls	2024-10-21 03:10:52 +02:00
Engel Nyst	9ac47bff8e	tweak prompts	2024-10-21 02:38:56 +02:00
Engel Nyst	67693a5e9c	core memory split	2024-10-21 02:23:42 +02:00
Engel Nyst	84428411b3	fix template include	2024-10-21 02:17:04 +02:00
Engel Nyst	fcdfb19f60	add voyage ai embeddings	2024-10-21 02:12:58 +02:00
Engel Nyst	11b3242746	fix var, run all stream embeddings on llama-index	2024-10-21 01:29:23 +02:00
Engel Nyst	f4ecd3a85b	add litellm embeddings for testing	2024-10-21 01:28:26 +02:00
Engel Nyst	53f7a78e9d	fix schemas, utils	2024-10-21 01:27:42 +02:00
Engel Nyst	16da4e222c	unit tests	2024-10-20 21:51:48 +02:00
Engel Nyst	1bf2d082bb	fix llm_config fallback	2024-10-20 21:51:32 +02:00
Engel Nyst	10293e60b8	add these actions to history; in-context example	2024-10-20 20:57:52 +02:00
Engel Nyst	aad59fc3ac	fix update	2024-10-20 19:43:16 +02:00
Engel Nyst	fb904590aa	summarize and recall	2024-10-20 19:31:39 +02:00
Engel Nyst	143f16d19f	add strings	2024-10-20 03:54:43 +02:00
Engel Nyst	d36917bc5a	tweak prompt	2024-10-20 03:53:35 +02:00
khushvind	20c9fa89c2	added summary prompt	2024-10-20 03:51:24 +02:00
khushvind	b77961b057	added summary response	2024-10-20 03:37:41 +02:00
Engel Nyst	e04f77a8f8	add action parser	2024-10-20 03:36:32 +02:00
Engel Nyst	2353c304fb	tweak prompts	2024-10-20 03:21:42 +02:00
Engel Nyst	4aedbc283e	clean up duplicate	2024-10-20 02:58:48 +02:00
khushvind	225d3302e1	added summarizer	2024-10-20 02:04:47 +02:00
Engel Nyst	5e572dbb18	tweaks to types	2024-10-20 01:47:56 +02:00
Engel Nyst	bf8412aa72	fix prompting	2024-10-20 01:38:08 +02:00
Engel Nyst	386b83549c	rename, delete module we won't use	2024-10-20 00:00:35 +02:00
Engel Nyst	7affbfd3fe	add prompts	2024-10-19 23:58:57 +02:00
Engel Nyst	c235d610ee	fix objects	2024-10-19 23:57:02 +02:00
Engel Nyst	cb60751642	fix imports	2024-10-19 21:03:52 +02:00
Engel Nyst	34a7b7098e	set user message	2024-10-19 21:00:25 +02:00
Engel Nyst	9e1cdcf30d	Merge branch 'enyst/eventstream-state' into memory-agent	2024-10-19 20:55:27 +02:00
Engel Nyst	7b0a8355ab	add get_last_user_message	2024-10-18 23:44:27 +02:00
Engel Nyst	df3f0b6120	wip add memory modules	2024-10-18 03:35:27 +02:00
Engel Nyst	611d0e49d8	use event.id in memory, fix merge	2024-10-17 23:04:27 +02:00
Engel Nyst	fccc9f764c	Merge branch 'enyst/eventstream-state' into memory-agent	2024-10-17 22:10:02 +02:00
Engel Nyst	696f5d1e92	fix merge	2024-10-17 21:08:55 +02:00
Engel Nyst	526190ceea	Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/eventstream-state	2024-10-17 20:53:53 +02:00
Engel Nyst	5f19a7cbb4	create a delegate obs when the delegate ends with an error	2024-10-17 20:02:41 +02:00
Engel Nyst	21ede6d9e9	fix delegate exclusion	2024-10-17 19:30:23 +02:00
Engel Nyst	5eb3322a33	more adaptations	2024-10-16 22:11:40 +02:00
Engel Nyst	fac01d15c3	adapt stuck	2024-10-16 21:51:53 +02:00
Engel Nyst	adc960f621	actually remove history	2024-10-16 20:51:50 +02:00
Engel Nyst	1de7b2be84	rewrite history	2024-10-16 19:52:20 +02:00
Engel Nyst	66f78d59d3	more adaptations in evals	2024-10-16 19:31:27 +02:00
Engel Nyst	3a81363204	refactoring in evals	2024-10-16 18:25:53 +02:00
Engel Nyst	6fc615fd18	fix types	2024-10-16 18:01:23 +02:00
Engel Nyst	59c16d4287	Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/eventstream-state	2024-10-16 17:12:35 +02:00
Engel Nyst	9e5659c507	remove history.py	2024-10-16 16:42:44 +02:00
Engel Nyst	267f3befa1	add filter by hidden	2024-10-16 16:23:35 +02:00
Engel Nyst	1ee26d7437	adapt code to list	2024-10-16 16:07:15 +02:00
Engel Nyst	981335ce96	retrieve history in the controller	2024-10-16 15:35:33 +02:00
Engel Nyst	0a7fb43e4f	move compatibility method to evals	2024-10-16 04:05:23 +02:00
Engel Nyst	abda3f4fc3	wip refactor methods	2024-10-16 01:55:09 +02:00
Engel Nyst	21f82e102d	reset branch, tweak stream.py	2024-10-16 00:28:24 +02:00
Engel Nyst	97ef06f1c7	try to use a list of events as history (ATTN will require tricks with delegates) fix things async on_event	2024-10-13 23:37:25 +02:00
Engel Nyst	6f004c8467	register agent	2024-10-13 21:14:48 +02:00
Engel Nyst	744be409e5	Merge branch 'main' of github.com:All-Hands-AI/OpenHands into memory-agent	2024-10-13 20:17:51 +02:00
Engel Nyst	3775ce213c	add memory-enabled agent	2024-10-13 18:39:48 +02:00