Files
OpenHands/agenthub/micro/agent.py
tobitege bc31fb15fe (fix) CodeActAgent: fix issues with vision support in prompts (#3665)
* CodeActAgent: fix message prep if prompt caching is not supported

* fix python version in regen tests workflow

* fix in conftest "mock_completion" method

* add disable_vision to LLMConfig; revert change in message parsing in llm.py

* format messages in several files for completion

* refactored message(s) formatting (llm.py); added vision_is_active()

* fix a unit test

* regenerate: added LOG_TO_FILE and FORCE_REGENERATE env flags

* try to fix path to logs folder in workflow

* llm: prevent index error

* try FORCE_USE_LLM in regenerate

* tweaks everywhere...

* fix 2 random unit test errors :(

* added FORCE_REGENERATE_TESTS=true to regenerate CLI

* fix test_lint_file_fail_typescript again

* double-quotes for env vars in workflow; llm logger set to debug

* fix typo in regenerate

* regenerate iterations now 20; applied iteration counter fix by Li

* regenerate: pass FORCE_REGENERATE flag into env

* fixes for int tests. several mock files updated.

* browsing_agent: fix response_parser.py adding ) to empty response

* test_browse_internet: fix skipif and revert obsolete mock files

* regenerate: fi bracketing for http server start/kill conditions

* disable test_browse_internet for CodeAct*Agents; mock files updated after merge

* missed to include more mock files earlier

* reverts after review feedback from Li

* forgot one

* browsing agent test, partial fixes and updated mock files

* test_browse_internet works in my WSL now!

* adapt unit test test_prompt_caching.py

* add DEBUG to regenerate workflow command

* convert regenerate workflow params to inputs

* more integration test mock files updated

* more files

* test_prompt_caching: restored test_prompt_caching_headers purpose

* file_ops: fix potential exception, like "cross device copy"; fixed mock files accordingly

* reverts/changes wrt feedback from xingyao

* updated docs and config template

* code cleanup wrt review feedback
2024-09-04 17:58:30 +02:00

86 lines
3.0 KiB
Python

from jinja2 import BaseLoader, Environment
from agenthub.micro.instructions import instructions
from agenthub.micro.registry import all_microagents
from openhands.controller.agent import Agent
from openhands.controller.state.state import State
from openhands.core.config import AgentConfig
from openhands.core.message import ImageContent, Message, TextContent
from openhands.core.utils import json
from openhands.events.action import Action
from openhands.events.serialization.action import action_from_dict
from openhands.events.serialization.event import event_to_memory
from openhands.llm.llm import LLM
from openhands.memory.history import ShortTermHistory
def parse_response(orig_response: str) -> Action:
# attempt to load the JSON dict from the response
action_dict = json.loads(orig_response)
# load the action from the dict
return action_from_dict(action_dict)
def to_json(obj, **kwargs):
"""Serialize an object to str format"""
return json.dumps(obj, **kwargs)
class MicroAgent(Agent):
VERSION = '1.0'
prompt = ''
agent_definition: dict = {}
def history_to_json(
self, history: ShortTermHistory, max_events: int = 20, **kwargs
):
"""
Serialize and simplify history to str format
"""
processed_history = []
event_count = 0
for event in history.get_events(reverse=True):
if event_count >= max_events:
break
processed_history.append(
event_to_memory(event, self.llm.config.max_message_chars)
)
event_count += 1
# history is in reverse order, let's fix it
processed_history.reverse()
return json.dumps(processed_history, **kwargs)
def __init__(self, llm: LLM, config: AgentConfig):
super().__init__(llm, config)
if 'name' not in self.agent_definition:
raise ValueError('Agent definition must contain a name')
self.prompt_template = Environment(loader=BaseLoader).from_string(self.prompt)
self.delegates = all_microagents.copy()
del self.delegates[self.agent_definition['name']]
def step(self, state: State) -> Action:
last_user_message, last_image_urls = state.get_current_user_intent()
prompt = self.prompt_template.render(
state=state,
instructions=instructions,
to_json=to_json,
history_to_json=self.history_to_json,
delegates=self.delegates,
latest_user_message=last_user_message,
)
content = [TextContent(text=prompt)]
if self.llm.vision_is_active() and last_image_urls:
content.append(ImageContent(image_urls=last_image_urls))
message = Message(role='user', content=content)
resp = self.llm.completion(
messages=self.llm.format_messages_for_llm(message),
temperature=0.0,
)
action_resp = resp['choices'][0]['message']['content']
action = parse_response(action_resp)
return action