mirror of
https://github.com/microsoft/autogen.git
synced 2026-01-26 11:58:09 -05:00
Parse Any HTML-esh Style Tags (#2046)
* tried implementing my own regex * improves tests * finally works * removes prints * fixed test * adds start and end * delete unused imports * refactored to use new tool * significantly improved algo * tag content -> tag attr * fix tests + adds new field * return full match * return remove start and end * update docstrings * update docstrings * update docstrings --------- Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com> Co-authored-by: Chi Wang <wang.chi@microsoft.com>
This commit is contained in:
@@ -1,14 +1,15 @@
|
||||
import base64
|
||||
import copy
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from typing import Dict, List, Tuple, Union
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from autogen.agentchat import utils
|
||||
|
||||
|
||||
def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
|
||||
"""
|
||||
@@ -179,13 +180,9 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
|
||||
last_index = 0
|
||||
image_count = 0
|
||||
|
||||
# Regular expression pattern for matching <img ...> tags
|
||||
img_tag_pattern = re.compile(r"<img ([^>]+)>")
|
||||
|
||||
# Find all image tags
|
||||
for match in img_tag_pattern.finditer(prompt):
|
||||
image_location = match.group(1)
|
||||
|
||||
for parsed_tag in utils.parse_tags_from_content("img", prompt):
|
||||
image_location = parsed_tag["attr"]["src"]
|
||||
try:
|
||||
if img_format == "pil":
|
||||
img_data = get_pil_image(image_location)
|
||||
@@ -202,12 +199,12 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
|
||||
continue
|
||||
|
||||
# Add text before this image tag to output list
|
||||
output.append({"type": "text", "text": prompt[last_index : match.start()]})
|
||||
output.append({"type": "text", "text": prompt[last_index : parsed_tag["match"].start()]})
|
||||
|
||||
# Add image data to output list
|
||||
output.append({"type": "image_url", "image_url": {"url": img_data}})
|
||||
|
||||
last_index = match.end()
|
||||
last_index = parsed_tag["match"].end()
|
||||
image_count += 1
|
||||
|
||||
# Add remaining text to output list
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
from typing import Any, List, Dict, Tuple, Callable
|
||||
import re
|
||||
from typing import Any, Callable, Dict, List, Tuple, Union
|
||||
|
||||
from .agent import Agent
|
||||
|
||||
|
||||
@@ -76,3 +78,108 @@ def gather_usage_summary(agents: List[Agent]) -> Tuple[Dict[str, any], Dict[str,
|
||||
aggregate_summary(actual_usage_summary, agent.client.actual_usage_summary)
|
||||
|
||||
return total_usage_summary, actual_usage_summary
|
||||
|
||||
|
||||
def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Parses HTML style tags from message contents.

    The parsing is done by looking for patterns in the text that match the format of HTML tags. The tag to be parsed is
    specified as an argument to the function. The function looks for this tag in the text and extracts its content. The
    content of a tag is everything that is inside the tag, between the opening and closing angle brackets. The content
    can be a single string or a set of attribute-value pairs.

    Examples:
        <img http://example.com/image.png> -> [{"tag": "img", "attr": {"src": "http://example.com/image.png"}, "match": re.Match}]
        <audio text="Hello I'm a robot" prompt="whisper"> ->
                [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}, "match": re.Match}]

    Args:
        tag (str): The HTML style tag to be parsed.
        content (Union[str, List[Dict[str, Any]]]): The message content to parse. Can be a string or a list of content
            items. For a list, only items whose "type" is "text" are parsed (using their "text" value); all other
            items are ignored.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, where each dictionary represents a parsed tag. Each dictionary
            contains three key-value pairs: 'tag' which is the tag name, 'attr' which is a dictionary of the parsed
            attributes, and 'match' which is a regular expression match object. For list content, the match object
            refers to the text of the individual item in which the tag was found.

    Raises:
        ValueError: If the content is not a string or a list.
    """
    if isinstance(content, str):
        return _parse_tags_from_text(tag, content)

    # Handles case for multimodal messages.
    if isinstance(content, list):
        results = []
        for item in content:
            if item.get("type") == "text":
                results.extend(_parse_tags_from_text(tag, item["text"]))
        return results

    raise ValueError(f"content must be str or list, but got {type(content)}")
|
||||
|
||||
|
||||
def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, str]]:
|
||||
pattern = re.compile(f"<{tag} (.*?)>")
|
||||
|
||||
results = []
|
||||
for match in re.finditer(pattern, text):
|
||||
tag_attr = match.group(1).strip()
|
||||
attr = _parse_attributes_from_tags(tag_attr)
|
||||
|
||||
results.append({"tag": tag, "attr": attr, "match": match})
|
||||
return results
|
||||
|
||||
|
||||
def _parse_attributes_from_tags(tag_content: str) -> Dict[str, str]:
    """Turns the inside of a tag into an attribute dictionary.

    The content is split on spaces, the pieces are re-joined into whole attributes by ``_reconstruct_attributes``,
    and each resulting piece is classified:

    * ``key='value'`` / ``key="value"`` becomes ``{key: value}`` with the surrounding quotes removed;
    * anything else (no ``=``, or an ``=`` not followed by a quote, such as a URL containing ``=``) is treated as
      part of the implicit ``src`` value. Several such pieces are joined with a single space.

    Args:
        tag_content (str): The text between ``<tag `` and ``>``.

    Returns:
        Dict[str, str]: The parsed attributes.
    """
    tokens = re.findall(r"([^ ]+)", tag_content)
    content: Dict[str, str] = {}

    for attr in _reconstruct_attributes(tokens):
        key, sep, value = attr.partition("=")

        if not sep or not value.startswith(("'", '"')):
            # Not a quoted key/value pair: the whole piece belongs to the implicit "src".
            content["src"] = f"{content['src']} {attr}" if "src" in content else attr
            continue

        # Drop the opening quote, and the closing quote only if it is really there. Unconditionally slicing off the
        # last character would eat a real character of an unterminated value.
        quote = value[0]
        if len(value) >= 2 and value.endswith(quote):
            content[key] = value[1:-1]
        else:
            content[key] = value[1:]

    return content
|
||||
|
||||
|
||||
def _reconstruct_attributes(attrs: List[str]) -> List[str]:
|
||||
"""Reconstructs attributes from a list of strings where some attributes may be split across multiple elements."""
|
||||
|
||||
def is_attr(attr: str) -> bool:
|
||||
if "=" in attr:
|
||||
_, value = attr.split("=", 1)
|
||||
if value.startswith("'") or value.startswith('"'):
|
||||
return True
|
||||
return False
|
||||
|
||||
reconstructed = []
|
||||
found_attr = False
|
||||
for attr in attrs:
|
||||
if is_attr(attr):
|
||||
reconstructed.append(attr)
|
||||
found_attr = True
|
||||
else:
|
||||
if found_attr:
|
||||
reconstructed[-1] += f" {attr}"
|
||||
found_attr = True
|
||||
elif reconstructed:
|
||||
reconstructed[-1] += f" {attr}"
|
||||
else:
|
||||
reconstructed.append(attr)
|
||||
return reconstructed
|
||||
|
||||
76
test/agentchat/test_agentchat_utils.py
Normal file
76
test/agentchat/test_agentchat_utils.py
Normal file
@@ -0,0 +1,76 @@
|
||||
from typing import Dict, List, Union
|
||||
from autogen import agentchat
|
||||
import pytest
|
||||
|
||||
# Each case pairs an input message with the tags expected to be parsed from it (without the "match" entry).
TAG_PARSING_TESTS = [
    # Bare URL inside the tag is reported as "src".
    {
        "message": "Hello agent, can you take a look at this image <img http://example.com/image.png>",
        "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
    },
    # Bare URL containing "=" is still a "src", not a key/value pair.
    {
        "message": "Can you transcribe this audio? <audio http://example.com/au=dio.mp3>",
        "expected": [{"tag": "audio", "attr": {"src": "http://example.com/au=dio.mp3"}}],
    },
    # Quoted attribute whose value itself contains "=".
    {
        "message": "Can you describe what's in this image <img url='http://example.com/=image.png'>",
        "expected": [{"tag": "img", "attr": {"url": "http://example.com/=image.png"}}],
    },
    # Two different tags in one message.
    {
        "message": "Can you describe what's in this image <img http://example.com/image.png> and transcribe this audio? <audio http://example.com/audio.mp3>",
        "expected": [
            {"tag": "img", "attr": {"src": "http://example.com/image.png"}},
            {"tag": "audio", "attr": {"src": "http://example.com/audio.mp3"}},
        ],
    },
    # Quoted value containing spaces and an apostrophe, followed by a second attribute.
    {
        "message": "Can you generate this audio? <audio text='Hello I'm a robot' prompt='whisper'>",
        "expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
    },
    # Bare "src" combined with a quoted attribute, plus a second tag of the same kind.
    {
        "message": "Can you describe what's in this image <img http://example.com/image.png width='100'> and this image <img http://hello.com/image=.png>?",
        "expected": [
            {"tag": "img", "attr": {"src": "http://example.com/image.png", "width": "100"}},
            {"tag": "img", "attr": {"src": "http://hello.com/image=.png"}},
        ],
    },
    # No tags at all.
    {
        "message": "Text with no tags",
        "expected": [],
    },
]
|
||||
|
||||
|
||||
def _delete_unused_keys(d: Dict) -> None:
|
||||
if "match" in d:
|
||||
del d["match"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_case", TAG_PARSING_TESTS)
def test_tag_parsing(test_case: Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
    """Check parse_tags_from_content against one case, for both string and list-of-items content."""
    message = test_case["message"]
    expected = test_case["expected"]
    tags = ["img", "audio", "random"]

    # The same message is parsed twice: as a plain string, then wrapped as a single multimodal text item.
    for content in (message, [{"type": "text", "text": message}]):
        result = []
        for tag in tags:
            parsed_tags = agentchat.utils.parse_tags_from_content(tag, content)
            # Strip the match objects so the dictionaries can be compared with the expected values.
            for item in parsed_tags:
                _delete_unused_keys(item)
            result.extend(parsed_tags)

        assert result == expected
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run when the file is executed directly: only the first case is exercised here.
    test_tag_parsing(TAG_PARSING_TESTS[0])
|
||||
Reference in New Issue
Block a user