improve CODE_BLOCK_PATTERN for a more robust code match (#571)

* improve CODE_BLOCK_PATTERN for more robust match

* improve and add tests

* Add support for \r\n

* Updated the regex to support indented code blocks (per the Markdown spec). Added test cases for both.

* Update formatting

---------

Co-authored-by: Adam Fourney <adamfo@microsoft.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
This commit is contained in:
Dear.Va
2023-11-21 13:06:56 +08:00
committed by GitHub
parent 19c7da2dd1
commit d22664f7e7
2 changed files with 85 additions and 7 deletions

View File

@@ -19,7 +19,15 @@ except ImportError:
DEFAULT_MODEL = "gpt-4"
FAST_MODEL = "gpt-3.5-turbo"
# Regular expression for finding a code block
CODE_BLOCK_PATTERN = r"```(\w*)\n(.*?)\n```"
# ```[ \t]*(\w+)?[ \t]*\r?\n(.*?)[ \t]*\r?\n``` Matches multi-line code blocks.
# The [ \t]* matches the potential spaces before language name.
# The (\w+)? matches the language, where the ? indicates it is optional.
# The [ \t]* matches the potential spaces (not newlines) after language name.
# The \r?\n makes sure there is a linebreak after ```.
# The (.*?) matches the code itself (non-greedy).
# The \r?\n makes sure there is a linebreak before ```.
# The [ \t]* matches the potential spaces before closing ``` (the spec allows indentation).
CODE_BLOCK_PATTERN = r"```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```"
WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extensions")
UNKNOWN = "unknown"
TIMEOUT_MSG = "Timeout"
@@ -59,6 +67,8 @@ def infer_lang(code):
return UNKNOWN
# TODO: In the future move, to better support https://spec.commonmark.org/0.30/#fenced-code-blocks
# perhaps by using a full Markdown parser.
def extract_code(
text: Union[str, List], pattern: str = CODE_BLOCK_PATTERN, detect_single_line_code: bool = False
) -> List[Tuple[str, str]]:
@@ -83,10 +93,8 @@ def extract_code(
return match if match else [(UNKNOWN, text)]
# Extract both multi-line and single-line code block, separated by the | operator
# `{3}(\w+)?\s*([\s\S]*?)`{3}: Matches multi-line code blocks.
# The (\w+)? matches the language, where the ? indicates it is optional.
# `([^`]+)`: Matches inline code.
code_pattern = re.compile(r"`{3}(\w+)?\s*([\s\S]*?)`{3}|`([^`]+)`")
code_pattern = re.compile(CODE_BLOCK_PATTERN + r"|`([^`]+)`")
code_blocks = code_pattern.findall(text)
# Extract the individual code blocks and languages from the matched groups

View File

@@ -185,6 +185,8 @@ print("hello extract code")
""",
detect_single_line_code=True,
)
print(codeblocks2)
assert codeblocks2 == codeblocks
# import pdb; pdb.set_trace()
@@ -207,9 +209,77 @@ url = "https://en.wikipedia.org/wiki/Web_scraping"
title, text = scrape(url)
print(f"Title: {title}")
print(f"Text: {text}")
```
"""
)
print(codeblocks)
assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"
codeblocks = extract_code(
"""
Example:
``` python
def scrape(url):
import requests
from bs4 import BeautifulSoup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
title = soup.find("title").text
text = soup.find("div", {"id": "bodyContent"}).text
return title, text
```
Test:
``` python
url = "https://en.wikipedia.org/wiki/Web_scraping"
title, text = scrape(url)
print(f"Title: {title}")
print(f"Text: {text}")
```
"""
)
print(codeblocks)
assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"
# Check for indented code blocks
codeblocks = extract_code(
"""
Example:
```python
def scrape(url):
import requests
from bs4 import BeautifulSoup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
title = soup.find("title").text
text = soup.find("div", {"id": "bodyContent"}).text
return title, text
```
"""
)
print(codeblocks)
assert len(codeblocks) == 1 and codeblocks[0][0] == "python"
# Check for codeblocks with \r\n
codeblocks = extract_code(
"""
Example:
``` python
def scrape(url):
import requests
from bs4 import BeautifulSoup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
title = soup.find("title").text
text = soup.find("div", {"id": "bodyContent"}).text
return title, text
```
""".replace(
"\n", "\r\n"
)
)
print(codeblocks)
assert len(codeblocks) == 1 and codeblocks[0][0] == "python"
codeblocks = extract_code("no code block")
assert len(codeblocks) == 1 and codeblocks[0] == (UNKNOWN, "no code block")
@@ -348,7 +418,7 @@ class TestContentStr(unittest.TestCase):
if __name__ == "__main__":
# test_infer_lang()
# test_extract_code()
test_execute_code()
test_extract_code()
# test_execute_code()
# test_find_code()
unittest.main()
# unittest.main()