improve CODE_BLOCK_PATTERN for a more robust code match (#571)

* improve CODE_BLOCK_PATTERN for more robust match * improve and add tests * Add support for \r\n * Updated the regex to support indented code blocks (per the Markdown spec). Added test cases for both. * Update formatting --------- Co-authored-by: Adam Fourney <adamfo@microsoft.com> Co-authored-by: Chi Wang <wang.chi@microsoft.com>
2026-04-20 03:02:16 -04:00 · 2023-11-21 13:06:56 +08:00
parent 19c7da2dd1
commit d22664f7e7
2 changed files with 85 additions and 7 deletions
--- a/autogen/code_utils.py
+++ b/autogen/code_utils.py
@@ -19,7 +19,15 @@ except ImportError:
 DEFAULT_MODEL = "gpt-4"
 FAST_MODEL = "gpt-3.5-turbo"
 # Regular expression for finding a code block
-CODE_BLOCK_PATTERN = r"```(\w*)\n(.*?)\n```"
+# ```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*``` Matches multi-line code blocks.
+#   The [ \t]* matches the potential spaces before language name.
+#   The (\w+)? matches the language, where the ? indicates it is optional.
+#   The [ \t]* matches the potential spaces (not newlines) after language name.
+#   The \r?\n makes sure there is a linebreak after ```.
+#   The (.*?) matches the code itself (non-greedy).
+#   The \r?\n makes sure there is a linebreak before ```.
+#   The [ \t]* matches the potential spaces before closing ``` (the spec allows indentation).
+CODE_BLOCK_PATTERN = r"```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```"
 WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extensions")
 UNKNOWN = "unknown"
 TIMEOUT_MSG = "Timeout"
@@ -59,6 +67,8 @@ def infer_lang(code):
        return UNKNOWN


+# TODO: In the future, move to better support https://spec.commonmark.org/0.30/#fenced-code-blocks
+#       perhaps by using a full Markdown parser.
 def extract_code(
    text: Union[str, List], pattern: str = CODE_BLOCK_PATTERN, detect_single_line_code: bool = False
 ) -> List[Tuple[str, str]]:
@@ -83,10 +93,8 @@ def extract_code(
        return match if match else [(UNKNOWN, text)]

    # Extract both multi-line and single-line code block, separated by the | operator
-    # `{3}(\w+)?\s*([\s\S]*?)`{3}: Matches multi-line code blocks.
-    #    The (\w+)? matches the language, where the ? indicates it is optional.
    # `([^`]+)`: Matches inline code.
-    code_pattern = re.compile(r"`{3}(\w+)?\s*([\s\S]*?)`{3}|`([^`]+)`")
+    code_pattern = re.compile(CODE_BLOCK_PATTERN + r"|`([^`]+)`")
    code_blocks = code_pattern.findall(text)

    # Extract the individual code blocks and languages from the matched groups
--- a/test/test_code.py
+++ b/test/test_code.py
@@ -185,6 +185,8 @@ print("hello extract code")
 """,
        detect_single_line_code=True,
    )
+    print(codeblocks2)
+
    assert codeblocks2 == codeblocks
    # import pdb; pdb.set_trace()

@@ -207,9 +209,77 @@ url = "https://en.wikipedia.org/wiki/Web_scraping"
 title, text = scrape(url)
 print(f"Title: {title}")
 print(f"Text: {text}")
+```
 """
    )
    print(codeblocks)
+    assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"
+
+    codeblocks = extract_code(
+        """
+Example:
+``` python
+def scrape(url):
+    import requests
+    from bs4 import BeautifulSoup
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    title = soup.find("title").text
+    text = soup.find("div", {"id": "bodyContent"}).text
+    return title, text
+```
+Test:
+``` python
+url = "https://en.wikipedia.org/wiki/Web_scraping"
+title, text = scrape(url)
+print(f"Title: {title}")
+print(f"Text: {text}")
+```
+"""
+    )
+    print(codeblocks)
+    assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"
+
+    # Check for indented code blocks
+    codeblocks = extract_code(
+        """
+Example:
+   ```python
+   def scrape(url):
+       import requests
+       from bs4 import BeautifulSoup
+       response = requests.get(url)
+       soup = BeautifulSoup(response.text, "html.parser")
+       title = soup.find("title").text
+       text = soup.find("div", {"id": "bodyContent"}).text
+       return title, text
+   ```
+"""
+    )
+    print(codeblocks)
+    assert len(codeblocks) == 1 and codeblocks[0][0] == "python"
+
+    # Check for codeblocks with \r\n
+    codeblocks = extract_code(
+        """
+Example:
+``` python
+def scrape(url):
+   import requests
+   from bs4 import BeautifulSoup
+   response = requests.get(url)
+   soup = BeautifulSoup(response.text, "html.parser")
+   title = soup.find("title").text
+   text = soup.find("div", {"id": "bodyContent"}).text
+   return title, text
+```
+""".replace(
+            "\n", "\r\n"
+        )
+    )
+    print(codeblocks)
+    assert len(codeblocks) == 1 and codeblocks[0][0] == "python"
+
    codeblocks = extract_code("no code block")
    assert len(codeblocks) == 1 and codeblocks[0] == (UNKNOWN, "no code block")

@@ -348,7 +418,7 @@ class TestContentStr(unittest.TestCase):

 if __name__ == "__main__":
    # test_infer_lang()
-    # test_extract_code()
-    test_execute_code()
+    test_extract_code()
+    # test_execute_code()
    # test_find_code()
-    unittest.main()
+    # unittest.main()