`extract_code` can now detect single-line code (#2)

* `extract_code` can now detect single-line code

* Add comments for RE

* Add test case and adjust UNKNOWN behavior

* Remove tmp test files

* Update autogen/code_utils.py

---------

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
This commit is contained in:
Beibin Li
2023-09-21 09:50:44 -07:00
committed by GitHub
parent 8a43c3bab1
commit 3627ca4f2c
2 changed files with 52 additions and 11 deletions

View File

@@ -34,24 +34,43 @@ def infer_lang(code):
return "python"
def extract_code(
    text: str, pattern: str = CODE_BLOCK_PATTERN, detect_single_line_code: bool = False
) -> List[Tuple[str, str]]:
    """Extract code from a text.

    Args:
        text (str): The text to extract code from.
        pattern (str, optional): The regular expression pattern for finding the
            code block. Defaults to CODE_BLOCK_PATTERN. It is only used when
            detect_single_line_code is False.
        detect_single_line_code (bool, optional): Also extract inline code
            wrapped in single backticks, in addition to fenced code blocks.
            Defaults to False.

    Returns:
        list: A list of tuples, each containing the language and the code.
          With detect_single_line_code=False, if there is no code block in
          the input text, the result is [(UNKNOWN, text)].
          With detect_single_line_code=True, a fenced block without a
          language and any inline code get the language ""; matches whose
          code is empty are dropped, and the list is empty when nothing
          matches.
    """
    if not detect_single_line_code:
        # Use the caller-supplied pattern to find all the code blocks;
        # fall back to the whole text when none is found.
        match = re.findall(pattern, text, flags=re.DOTALL)
        return match if match else [(UNKNOWN, text)]

    # Extract both multi-line and single-line code block, separated by the | operator
    # `{3}(\w+)?\s*([\s\S]*?)`{3}: Matches multi-line code blocks.
    #    The (\w+)? matches the language, where the ? indicates it is optional.
    # `([^`]+)`: Matches inline code.
    code_pattern = re.compile(r"`{3}(\w+)?\s*([\s\S]*?)`{3}|`([^`]+)`")
    code_blocks = code_pattern.findall(text)

    # Extract the individual code blocks and languages from the matched groups.
    # findall yields "" for groups that did not participate in a match, so at
    # most one of group1 (fenced block) / group2 (inline code) is non-empty.
    extracted = []
    for lang, group1, group2 in code_blocks:
        if group1:
            extracted.append((lang.strip(), group1.strip()))
        elif group2:
            extracted.append(("", group2.strip()))

    return extracted
# _FIND_CODE_SYS_MSG = [

View File

@@ -161,10 +161,23 @@ Example:
```
print("hello extract code")
```
"""
""",
detect_single_line_code=False,
)
print(codeblocks)
codeblocks2 = extract_code(
"""
Example:
```
print("hello extract code")
```
""",
detect_single_line_code=True,
)
assert codeblocks2 == codeblocks
# import pdb; pdb.set_trace()
codeblocks = extract_code(
"""
Example:
@@ -190,6 +203,15 @@ print(f"Text: {text}")
codeblocks = extract_code("no code block")
assert len(codeblocks) == 1 and codeblocks[0] == (UNKNOWN, "no code block")
# Disable single line code detection
line = "Run `source setup.sh` from terminal"
codeblocks = extract_code(line, detect_single_line_code=False)
assert len(codeblocks) == 1 and codeblocks[0] == (UNKNOWN, line)
# Enable single line code detection
codeblocks = extract_code("Run `source setup.sh` from terminal", detect_single_line_code=True)
assert len(codeblocks) == 1 and codeblocks[0] == ("", "source setup.sh")
@pytest.mark.skipif(
sys.platform in ["darwin", "win32"],