Parse Any HTML-esh Style Tags (#2046)

* tried implementing my own regex

* improves tests

* finally works

* removes prints

* fixed test

* adds start and end

* delete unused imports

* refactored to use new tool

* significantly improved algo

* tag content -> tag attr

* fix tests + adds new field

* return full match

* return remove start and end

* update docstrings

* update docstrings

* update docstrings

---------

Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
This commit is contained in:
Wael Karkoub
2024-03-26 19:46:44 +01:00
committed by GitHub
parent 59a7790578
commit 66d96dd887
3 changed files with 191 additions and 11 deletions

View File

@@ -1,14 +1,15 @@
import base64
import copy
import mimetypes
import os
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Dict, List, Tuple, Union
import requests
from PIL import Image
from autogen.agentchat import utils
def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
"""
@@ -179,13 +180,9 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
last_index = 0
image_count = 0
# Regular expression pattern for matching <img ...> tags
img_tag_pattern = re.compile(r"<img ([^>]+)>")
# Find all image tags
for match in img_tag_pattern.finditer(prompt):
image_location = match.group(1)
for parsed_tag in utils.parse_tags_from_content("img", prompt):
image_location = parsed_tag["attr"]["src"]
try:
if img_format == "pil":
img_data = get_pil_image(image_location)
@@ -202,12 +199,12 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
continue
# Add text before this image tag to output list
output.append({"type": "text", "text": prompt[last_index : match.start()]})
output.append({"type": "text", "text": prompt[last_index : parsed_tag["match"].start()]})
# Add image data to output list
output.append({"type": "image_url", "image_url": {"url": img_data}})
last_index = match.end()
last_index = parsed_tag["match"].end()
image_count += 1
# Add remaining text to output list

View File

@@ -1,4 +1,6 @@
from typing import Any, List, Dict, Tuple, Callable
import re
from typing import Any, Callable, Dict, List, Tuple, Union
from .agent import Agent
@@ -76,3 +78,108 @@ def gather_usage_summary(agents: List[Agent]) -> Tuple[Dict[str, any], Dict[str,
aggregate_summary(actual_usage_summary, agent.client.actual_usage_summary)
return total_usage_summary, actual_usage_summary
def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Parses HTML style tags from message contents.

    The parsing is done by looking for patterns in the text that match the format of HTML tags. The tag to be parsed is
    specified as an argument to the function. The function looks for this tag in the text and extracts its content. The
    content of a tag is everything that is inside the tag, between the opening and closing angle brackets. The content
    can be a single string or a set of attribute-value pairs.

    Examples:
        <img http://example.com/image.png> -> [{"tag": "img", "attr": {"src": "http://example.com/image.png"}, "match": re.Match}]
        <audio text="Hello I'm a robot" prompt="whisper"> ->
                [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}, "match": re.Match}]

    Args:
        tag (str): The HTML style tag to be parsed.
        content (Union[str, List[Dict[str, Any]]]): The message content to parse. Can be a string or a list of content
            items. In a list, only items whose "type" is "text" are parsed (their "text" value is scanned); every
            other item is ignored.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, where each dictionary represents a parsed tag. Each dictionary
            contains three key-value pairs: 'tag' which is the tag name, 'attr' which is a dictionary of the parsed
            attributes, and 'match' which is the regular expression match object for the tag.

    Raises:
        ValueError: If the content is not a string or a list.
    """
    if isinstance(content, str):
        # Copy so the caller always receives a list it owns.
        return list(_parse_tags_from_text(tag, content))

    # Handles case for multimodal messages.
    if isinstance(content, list):
        results: List[Dict[str, Any]] = []
        for item in content:
            if item.get("type") == "text":
                results.extend(_parse_tags_from_text(tag, item["text"]))
        return results

    raise ValueError(f"content must be str or list, but got {type(content)}")
def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, str]]:
pattern = re.compile(f"<{tag} (.*?)>")
results = []
for match in re.finditer(pattern, text):
tag_attr = match.group(1).strip()
attr = _parse_attributes_from_tags(tag_attr)
results.append({"tag": tag, "attr": attr, "match": match})
return results
def _parse_attributes_from_tags(tag_content: str) -> Dict[str, str]:
    """Turns the body of a tag into a dictionary of attributes.

    The body is split on spaces and regrouped with ``_reconstruct_attributes`` so that quoted values containing
    spaces stay together. Each regrouped piece is then either a ``key='value'`` / ``key="value"`` pair, stored
    under ``key`` without the surrounding quotes, or anything else, which is collected (space separated) under
    the ``"src"`` key. A piece that contains "=" but whose right-hand side is not quoted is kept whole as part
    of ``"src"``, so URLs containing "=" survive intact.

    Args:
        tag_content (str): The text between ``<tag `` and ``>``.

    Returns:
        Dict[str, str]: The parsed attributes.
    """
    tokens = re.findall(r"([^ ]+)", tag_content)
    content: Dict[str, str] = {}

    def _append_src_value(value: str) -> None:
        # Several unquoted pieces are joined back together with single spaces.
        if "src" in content:
            content["src"] += f" {value}"
        else:
            content["src"] = value

    for attr in _reconstruct_attributes(tokens):
        key, sep, value = attr.partition("=")
        if sep and value.startswith(("'", '"')):
            # Only drop the last character when it really is the matching closing quote; otherwise an
            # unterminated value such as width='100 would silently lose its final character.
            quote = value[0]
            closed = len(value) >= 2 and value.endswith(quote)
            content[key] = value[1:-1] if closed else value[1:]
        else:
            _append_src_value(attr)

    return content
def _reconstruct_attributes(attrs: List[str]) -> List[str]:
"""Reconstructs attributes from a list of strings where some attributes may be split across multiple elements."""
def is_attr(attr: str) -> bool:
if "=" in attr:
_, value = attr.split("=", 1)
if value.startswith("'") or value.startswith('"'):
return True
return False
reconstructed = []
found_attr = False
for attr in attrs:
if is_attr(attr):
reconstructed.append(attr)
found_attr = True
else:
if found_attr:
reconstructed[-1] += f" {attr}"
found_attr = True
elif reconstructed:
reconstructed[-1] += f" {attr}"
else:
reconstructed.append(attr)
return reconstructed

View File

@@ -0,0 +1,76 @@
from typing import Dict, List, Union
from autogen import agentchat
import pytest
# Table of parsing scenarios consumed by test_tag_parsing. Each entry pairs an input "message" with the
# "expected" parse result; the "match" key is omitted here because the test deletes it before comparing.
TAG_PARSING_TESTS = [
# A bare URL inside the tag is reported under the "src" attribute.
{
"message": "Hello agent, can you take a look at this image <img http://example.com/image.png>",
"expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
},
# An "=" inside an unquoted URL must not be treated as a key/value separator.
{
"message": "Can you transcribe this audio? <audio http://example.com/au=dio.mp3>",
"expected": [{"tag": "audio", "attr": {"src": "http://example.com/au=dio.mp3"}}],
},
# A quoted key/value attribute; the "=" inside the quoted value is preserved.
{
"message": "Can you describe what's in this image <img url='http://example.com/=image.png'>",
"expected": [{"tag": "img", "attr": {"url": "http://example.com/=image.png"}}],
},
# Two different tags in one message; the test queries "img" before "audio", hence this order.
{
"message": "Can you describe what's in this image <img http://example.com/image.png> and transcribe this audio? <audio http://example.com/audio.mp3>",
"expected": [
{"tag": "img", "attr": {"src": "http://example.com/image.png"}},
{"tag": "audio", "attr": {"src": "http://example.com/audio.mp3"}},
],
},
# Quoted values may contain spaces and an apostrophe; a second attribute follows.
{
"message": "Can you generate this audio? <audio text='Hello I'm a robot' prompt='whisper'>",
"expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
},
# A bare URL combined with a quoted attribute, plus a second tag of the same kind.
{
"message": "Can you describe what's in this image <img http://example.com/image.png width='100'> and this image <img http://hello.com/image=.png>?",
"expected": [
{"tag": "img", "attr": {"src": "http://example.com/image.png", "width": "100"}},
{"tag": "img", "attr": {"src": "http://hello.com/image=.png"}},
],
},
# A message without any tag yields an empty result.
{
"message": "Text with no tags",
"expected": [],
},
]
def _delete_unused_keys(d: Dict) -> None:
if "match" in d:
del d["match"]
@pytest.mark.parametrize("test_case", TAG_PARSING_TESTS)
def test_tag_parsing(test_case: Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
    """Check parse_tags_from_content against one scenario, for both plain-string and list-style content."""
    message = test_case["message"]
    expected = test_case["expected"]
    tags = ["img", "audio", "random"]

    def collect(make_content):
        # Query each tag in turn and strip the "match" objects so the results compare as plain data.
        found = []
        for tag in tags:
            parsed_tags = agentchat.utils.parse_tags_from_content(tag, make_content())
            for item in parsed_tags:
                _delete_unused_keys(item)
            found.extend(parsed_tags)
        return found

    # Content given as a plain string.
    assert collect(lambda: message) == expected

    # Content given as a multimodal-style list holding a single text item.
    assert collect(lambda: [{"type": "text", "text": message}]) == expected
# When executed directly as a script, run only the first scenario of the table.
if __name__ == "__main__":
    test_tag_parsing(TAG_PARSING_TESTS[0])