mirror of
https://github.com/microsoft/autogen.git
synced 2026-04-20 03:02:16 -04:00
improve CODE_BLOCK_PATTERN for a more robust code match (#571)
* improve CODE_BLOCK_PATTERN for more robust match
* improve and add tests
* Add support for \r\n
* Updated the regex to support indented code blocks (per the Markdown spec). Added test cases for both.
* Update formatting
---------
Co-authored-by: Adam Fourney <adamfo@microsoft.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
This commit is contained in:
@@ -19,7 +19,15 @@ except ImportError:
|
||||
DEFAULT_MODEL = "gpt-4"
|
||||
FAST_MODEL = "gpt-3.5-turbo"
|
||||
# Regular expression for finding a code block
|
||||
CODE_BLOCK_PATTERN = r"```(\w*)\n(.*?)\n```"
|
||||
# ```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*``` Matches multi-line code blocks.
|
||||
# The [ \t]* matches the potential spaces before language name.
|
||||
# The (\w+)? matches the language, where the ? indicates it is optional.
|
||||
# The [ \t]* matches the potential spaces (not newlines) after language name.
|
||||
# The \r?\n makes sure there is a linebreak after ```.
|
||||
# The (.*?) matches the code itself (non-greedy).
|
||||
# The \r?\n makes sure there is a linebreak before ```.
|
||||
# The [ \t]* matches the potential spaces before closing ``` (the spec allows indentation).
|
||||
CODE_BLOCK_PATTERN = r"```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```"
|
||||
WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extensions")
|
||||
UNKNOWN = "unknown"
|
||||
TIMEOUT_MSG = "Timeout"
|
||||
@@ -59,6 +67,8 @@ def infer_lang(code):
|
||||
return UNKNOWN
|
||||
|
||||
|
||||
# TODO: In the future, move to better support https://spec.commonmark.org/0.30/#fenced-code-blocks
|
||||
# perhaps by using a full Markdown parser.
|
||||
def extract_code(
|
||||
text: Union[str, List], pattern: str = CODE_BLOCK_PATTERN, detect_single_line_code: bool = False
|
||||
) -> List[Tuple[str, str]]:
|
||||
@@ -83,10 +93,8 @@ def extract_code(
|
||||
return match if match else [(UNKNOWN, text)]
|
||||
|
||||
# Extract both multi-line and single-line code blocks, separated by the | operator
|
||||
# `{3}(\w+)?\s*([\s\S]*?)`{3}: Matches multi-line code blocks.
|
||||
# The (\w+)? matches the language, where the ? indicates it is optional.
|
||||
# `([^`]+)`: Matches inline code.
|
||||
code_pattern = re.compile(r"`{3}(\w+)?\s*([\s\S]*?)`{3}|`([^`]+)`")
|
||||
code_pattern = re.compile(CODE_BLOCK_PATTERN + r"|`([^`]+)`")
|
||||
code_blocks = code_pattern.findall(text)
|
||||
|
||||
# Extract the individual code blocks and languages from the matched groups
|
||||
|
||||
@@ -185,6 +185,8 @@ print("hello extract code")
|
||||
""",
|
||||
detect_single_line_code=True,
|
||||
)
|
||||
print(codeblocks2)
|
||||
|
||||
assert codeblocks2 == codeblocks
|
||||
# import pdb; pdb.set_trace()
|
||||
|
||||
@@ -207,9 +209,77 @@ url = "https://en.wikipedia.org/wiki/Web_scraping"
|
||||
title, text = scrape(url)
|
||||
print(f"Title: {title}")
|
||||
print(f"Text: {text}")
|
||||
```
|
||||
"""
|
||||
)
|
||||
print(codeblocks)
|
||||
assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"
|
||||
|
||||
codeblocks = extract_code(
|
||||
"""
|
||||
Example:
|
||||
``` python
|
||||
def scrape(url):
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
response = requests.get(url)
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
title = soup.find("title").text
|
||||
text = soup.find("div", {"id": "bodyContent"}).text
|
||||
return title, text
|
||||
```
|
||||
Test:
|
||||
``` python
|
||||
url = "https://en.wikipedia.org/wiki/Web_scraping"
|
||||
title, text = scrape(url)
|
||||
print(f"Title: {title}")
|
||||
print(f"Text: {text}")
|
||||
```
|
||||
"""
|
||||
)
|
||||
print(codeblocks)
|
||||
assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"
|
||||
|
||||
# Check for indented code blocks
|
||||
codeblocks = extract_code(
|
||||
"""
|
||||
Example:
|
||||
```python
|
||||
def scrape(url):
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
response = requests.get(url)
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
title = soup.find("title").text
|
||||
text = soup.find("div", {"id": "bodyContent"}).text
|
||||
return title, text
|
||||
```
|
||||
"""
|
||||
)
|
||||
print(codeblocks)
|
||||
assert len(codeblocks) == 1 and codeblocks[0][0] == "python"
|
||||
|
||||
# Check for codeblocks with \r\n
|
||||
codeblocks = extract_code(
|
||||
"""
|
||||
Example:
|
||||
``` python
|
||||
def scrape(url):
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
response = requests.get(url)
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
title = soup.find("title").text
|
||||
text = soup.find("div", {"id": "bodyContent"}).text
|
||||
return title, text
|
||||
```
|
||||
""".replace(
|
||||
"\n", "\r\n"
|
||||
)
|
||||
)
|
||||
print(codeblocks)
|
||||
assert len(codeblocks) == 1 and codeblocks[0][0] == "python"
|
||||
|
||||
codeblocks = extract_code("no code block")
|
||||
assert len(codeblocks) == 1 and codeblocks[0] == (UNKNOWN, "no code block")
|
||||
|
||||
@@ -348,7 +418,7 @@ class TestContentStr(unittest.TestCase):
|
||||
|
||||
if __name__ == "__main__":
|
||||
# test_infer_lang()
|
||||
# test_extract_code()
|
||||
test_execute_code()
|
||||
test_extract_code()
|
||||
# test_execute_code()
|
||||
# test_find_code()
|
||||
unittest.main()
|
||||
# unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user