Refactor monologue to use the messages in state history

remove now unused method
2026-01-09 14:57:59 -05:00 · 2024-05-17 17:40:45 +02:00
parent 7ca560471b
commit 76b4b765ef
2 changed files with 112 additions and 94 deletions
--- a/agenthub/monologue_agent/agent.py
+++ b/agenthub/monologue_agent/agent.py
@@ -1,4 +1,5 @@
 import agenthub.monologue_agent.utils.prompts as prompts
+from agenthub.monologue_agent.utils.prompts import INITIAL_THOUGHTS
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import config
@@ -25,7 +26,6 @@ from opendevin.events.observation import (
 from opendevin.events.serialization.event import event_to_memory
 from opendevin.llm.llm import LLM
 from opendevin.memory.condenser import MemoryCondenser
-from opendevin.memory.history import ShortTermHistory

 if config.agent.memory_enabled:
    from opendevin.memory.memory import LongTermMemory
@@ -33,50 +33,33 @@ if config.agent.memory_enabled:
 MAX_TOKEN_COUNT_PADDING = 512
 MAX_OUTPUT_LENGTH = 5000

-INITIAL_THOUGHTS = [
-    'I exist!',
-    'Hmm...looks like I can type in a command line prompt',
-    'Looks like I have a web browser too!',
-    "Here's what I want to do: $TASK",
-    'How am I going to get there though?',
-    'It seems like I have some kind of short term memory.',
-    'Each of my thoughts seems to be stored in a JSON array.',
-    'It seems whatever I say next will be added as an object to the list.',
-    'But no one has perfect short-term memory. My list of thoughts will be summarized and condensed over time, losing information in the process.',
-    'Fortunately I have long term memory!',
-    'I can just perform a recall action, followed by the thing I want to remember. And then related thoughts just spill out!',
-    "Sometimes they're random thoughts that don't really have to do with what I wanted to remember. But usually they're exactly what I need!",
-    "Let's try it out!",
-    'RECALL what it is I want to do',
-    "Here's what I want to do: $TASK",
-    'How am I going to get there though?',
-    "Neat! And it looks like it's easy for me to use the command line too! I just have to perform a run action and include the command I want to run in the command argument. The command output just jumps into my head!",
-    'RUN echo "hello world"',
-    'hello world',
-    'Cool! I bet I can write files too using the write action.',
-    'WRITE echo "console.log(\'hello world\')" > test.js',
-    '',
-    "I just created test.js. I'll try and run it now.",
-    'RUN node test.js',
-    'hello world',
-    'It works!',
-    "I'm going to try reading it now using the read action.",
-    'READ test.js',
-    "console.log('hello world')",
-    'Nice! I can read files too!',
-    'And if I want to use the browser, I just need to use the browse action and include the url I want to visit in the url argument',
-    "Let's try that...",
-    'BROWSE google.com',
-    '<form><input type="text"></input><button type="submit"></button></form>',
-    'I can browse the web too!',
-    'And once I have completed my task, I can use the finish action to stop working.',
-    "But I should only use the finish action when I'm absolutely certain that I've completed my task and have tested my work.",
-    'Very cool. Now to accomplish my task.',
-    "I'll need a strategy. And as I make progress, I'll need to keep refining that strategy. I'll need to set goals, and break them into sub-goals.",
-    'In between actions, I must always take some time to think, strategize, and set new goals. I should never take two actions in a row.',
-    "OK so my task is to $TASK. I haven't made any progress yet. Where should I start?",
-    'It seems like there might be an existing project here. I should probably start by running `pwd` and `ls` to orient myself.',
-]
+
+def truncate_output(
+    observation: dict, max_chars: int = MAX_OUTPUT_LENGTH
+) -> dict[str, str]:
+    """
+    Truncates the output of an observation to a maximum number of characters.
+
+    Parameters:
+    - output (str): The observation whose output to truncate
+    - max_chars (int): The maximum number of characters to allow
+
+    Returns:
+    - str: The truncated output
+    """
+    if (
+        'args' in observation
+        and 'output' in observation['args']
+        and len(observation['args']['output']) > max_chars
+    ):
+        output = observation['args']['output']
+        half = max_chars // 2
+        observation['args']['output'] = (
+            output[:half]
+            + '\n[... Output truncated due to length...]\n'
+            + output[-half:]
+        )
+    return observation


 class MonologueAgent(Agent):
@@ -87,7 +70,7 @@ class MonologueAgent(Agent):
    """

    _initialized = False
-    monologue: ShortTermHistory
+    monologue: list[dict[str, str]]  # initial thoughts ONLY
    memory: 'LongTermMemory | None'
    memory_condenser: MemoryCondenser

@@ -100,44 +83,6 @@ class MonologueAgent(Agent):
        """
        super().__init__(llm)

-    def _add_event(self, event_dict: dict):
-        """
-        Adds a new event to the agent's monologue and memory.
-        Monologue automatically condenses when it gets too large.
-
-        Parameters:
-        - event (dict): The event that will be added to monologue and memory
-        """
-
-        if (
-            'args' in event_dict
-            and 'output' in event_dict['args']
-            and len(event_dict['args']['output']) > MAX_OUTPUT_LENGTH
-        ):
-            event_dict['args']['output'] = (
-                event_dict['args']['output'][:MAX_OUTPUT_LENGTH] + '...'
-            )
-
-        self.monologue.add_event(event_dict)
-        if self.memory is not None:
-            self.memory.add_event(event_dict)
-
-        # Test monologue token length
-        prompt = prompts.get_request_action_prompt(
-            '',
-            self.monologue.get_events(),
-            [],
-        )
-        messages = [{'content': prompt, 'role': 'user'}]
-        token_count = self.llm.get_token_count(messages)
-
-        if token_count + MAX_TOKEN_COUNT_PADDING > self.llm.max_input_tokens:
-            prompt = prompts.get_summarize_monologue_prompt(self.monologue.events)
-            summary_response = self.memory_condenser.condense(
-                summarize_prompt=prompt, llm=self.llm
-            )
-            self.monologue.events = prompts.parse_summary_response(summary_response)
-
    def _initialize(self, task: str):
        """
        Utilizes the INITIAL_THOUGHTS list to give the agent a context for its capabilities
@@ -158,7 +103,7 @@ class MonologueAgent(Agent):
        if task is None or task == '':
            raise AgentNoInstructionError()

-        self.monologue = ShortTermHistory()
+        self.monologue = []
        if config.agent.memory_enabled:
            self.memory = LongTermMemory()
        else:
@@ -187,7 +132,7 @@ class MonologueAgent(Agent):
                    observation = BrowserOutputObservation(
                        content=thought, url='', screenshot=''
                    )
-                self._add_event(event_to_memory(observation))
+                self.monologue.append(event_to_memory(observation))
                previous_action = ''
            else:
                action: Action = NullAction()
@@ -214,7 +159,7 @@ class MonologueAgent(Agent):
                    previous_action = ActionType.BROWSE
                else:
                    action = MessageAction(thought)
-                self._add_event(event_to_memory(action))
+                self.monologue.append(event_to_memory(action))

    def step(self, state: State) -> Action:
        """
@@ -229,21 +174,49 @@ class MonologueAgent(Agent):

        goal = state.get_current_user_intent()
        self._initialize(goal)
-        for prev_action, obs in state.updated_info:
-            self._add_event(event_to_memory(prev_action))
-            self._add_event(event_to_memory(obs))
-
-        state.updated_info = []

+        # the action prompt starts with the initial thoughts
        prompt = prompts.get_request_action_prompt(
            goal,
-            self.monologue.get_events(),
+            self.monologue,
            state.background_commands_obs,
        )
-        messages = [{'content': prompt, 'role': 'user'}]
+        messages: list[dict[str, str]] = [
+            {'role': 'system', 'content': prompt},
+        ]
+
+        # add the messages from state.history
+        for prev_action, obs in state.history:
+            messages.append(event_to_memory(prev_action))
+            messages.append(truncate_output(event_to_memory(obs)))
+
+        # add the last messages to long term memory
+        if self.memory is not None and state.history and len(state.history) > 0:
+            self.memory.add_event(event_to_memory(state.history[-1][0]))
+            self.memory.add_event(
+                truncate_output(event_to_memory(state.history[-1][1]))
+            )
+
+        # FIXME this has to go
+        # Test monologue token length
+        token_count = self.llm.get_token_count(messages[1:])
+        if token_count + MAX_TOKEN_COUNT_PADDING > self.llm.max_input_tokens:
+            prompt = prompts.get_summarize_monologue_prompt(messages[1:])
+            summary_response = self.memory_condenser.condense(
+                summarize_prompt=prompt, llm=self.llm
+            )
+            prompts.parse_summary_response(summary_response)
+        # end FIXME
+
+        # request the next action from the LLM
        resp = self.llm.completion(messages=messages)
+
+        # get the next action from the response
        action_resp = resp['choices'][0]['message']['content']
+
+        # keep track of a fallback option to stop the prompting when it gets too large
        state.num_of_chars += len(prompt) + len(action_resp)
+
        action = prompts.parse_action_response(action_resp)
        self.latest_action = action
        return action
--- a/agenthub/monologue_agent/utils/prompts.py
+++ b/agenthub/monologue_agent/utils/prompts.py
@@ -92,6 +92,51 @@ The action key may be `summarize`, and `args.summary` should contain the summary
 You can also use the same action and args from the source monologue.
 """

+INITIAL_THOUGHTS = [
+    'I exist!',
+    'Hmm...looks like I can type in a command line prompt',
+    'Looks like I have a web browser too!',
+    "Here's what I want to do: $TASK",
+    'How am I going to get there though?',
+    'It seems like I have some kind of short term memory.',
+    'Each of my thoughts seems to be stored in a JSON array.',
+    'It seems whatever I say next will be added as an object to the list.',
+    'But no one has perfect short-term memory. My list of thoughts will be summarized and condensed over time, losing information in the process.',
+    'Fortunately I have long term memory!',
+    'I can just perform a recall action, followed by the thing I want to remember. And then related thoughts just spill out!',
+    "Sometimes they're random thoughts that don't really have to do with what I wanted to remember. But usually they're exactly what I need!",
+    "Let's try it out!",
+    'RECALL what it is I want to do',
+    "Here's what I want to do: $TASK",
+    'How am I going to get there though?',
+    "Neat! And it looks like it's easy for me to use the command line too! I just have to perform a run action and include the command I want to run in the command argument. The command output just jumps into my head!",
+    'RUN echo "hello world"',
+    'hello world',
+    'Cool! I bet I can write files too using the write action.',
+    'WRITE echo "console.log(\'hello world\')" > test.js',
+    '',
+    "I just created test.js. I'll try and run it now.",
+    'RUN node test.js',
+    'hello world',
+    'It works!',
+    "I'm going to try reading it now using the read action.",
+    'READ test.js',
+    "console.log('hello world')",
+    'Nice! I can read files too!',
+    'And if I want to use the browser, I just need to use the browse action and include the url I want to visit in the url argument',
+    "Let's try that...",
+    'BROWSE google.com',
+    '<form><input type="text"></input><button type="submit"></button></form>',
+    'I can browse the web too!',
+    'And once I have completed my task, I can use the finish action to stop working.',
+    "But I should only use the finish action when I'm absolutely certain that I've completed my task and have tested my work.",
+    'Very cool. Now to accomplish my task.',
+    "I'll need a strategy. And as I make progress, I'll need to keep refining that strategy. I'll need to set goals, and break them into sub-goals.",
+    'In between actions, I must always take some time to think, strategize, and set new goals. I should never take two actions in a row.',
+    "OK so my task is to $TASK. I haven't made any progress yet. Where should I start?",
+    'It seems like there might be an existing project here. I should probably start by running `pwd` and `ls` to orient myself.',
+]
+

 def get_summarize_monologue_prompt(thoughts: list[dict]):
    """