Files
gpt-pilot/core/llm/parser.py
Senko Rasic 5b474ccc1f merge gpt-pilot 0.2 codebase
This is a complete rewrite of the GPT Pilot core, from the ground
up, making the agentic architecture front and center, and also
fixing some long-standing problems with the database architecture
that weren't feasible to solve without breaking compatibility.

As the database structure and config file syntax have changed,
we have automatic imports for projects and current configs,
see the README.md file for details.

This also relicenses the project to FSL-1.1-MIT license.
2024-05-22 21:42:25 +02:00

162 lines
4.7 KiB
Python

import json
import re
from enum import Enum
from typing import Optional, Union
from pydantic import BaseModel, ValidationError
class MultiCodeBlockParser:
"""
Parse multiple Markdown code blocks from a string.
Expects zero or more blocks, and ignores any text
outside of the code blocks.
Example usage:
>>> parser = MultiCodeBlockParser()
>>> text = '''
... text outside block
...
... ```python
... first block
... ```
... some text between blocks
... ```js
... more
... code
... ```
... some text after blocks
'''
>>> assert parser(text) == ["first block", "more\ncode"]
If no code blocks are found, an empty list is returned:
"""
def __init__(self):
# FIXME: ``` should be the only content on the line`
self.pattern = re.compile(r"```([a-z0-9]+\n)?(.*?)```\s*", re.DOTALL)
def __call__(self, text: str) -> list[str]:
blocks = []
for block in self.pattern.findall(text):
blocks.append(block[1].strip())
return blocks
class CodeBlockParser(MultiCodeBlockParser):
"""
Parse a Markdown code block from a string.
Expects exactly one code block, and ignores
any text before or after it.
Usage:
>>> parser = CodeBlockParser()
>>> text = "text\n```py\ncodeblock\n'''\nmore text"
>>> assert parser(text) == "codeblock"
This is a special case of MultiCodeBlockParser,
checking that there's exactly one block.
"""
def __call__(self, text: str) -> str:
blocks = super().__call__(text)
# FIXME: if there are more than 1 code block, this means the output actually contains ```,
# so re-parse this with that in mind
if len(blocks) != 1:
raise ValueError(f"Expected a single code block, got {len(blocks)}")
return blocks[0]
class OptionalCodeBlockParser:
def __call__(self, text: str) -> str:
text = text.strip()
if text.startswith("```") and text.endswith("\n```"):
# Remove the first and last line. Note the first line may include syntax
# highlighting, so we can't just remove the first 3 characters.
text = "\n".join(text.splitlines()[1:-1]).strip()
return text
class JSONParser:
def __init__(self, spec: Optional[BaseModel] = None, strict: bool = True):
self.spec = spec
self.strict = strict or (spec is not None)
@property
def schema(self):
return self.spec.model_json_schema() if self.spec else None
@staticmethod
def errors_to_markdown(errors: list) -> str:
error_txt = []
for error in errors:
loc = ".".join(str(loc) for loc in error["loc"])
etype = error["type"]
msg = error["msg"]
error_txt.append(f"- `{loc}`: {etype} ({msg})")
return "\n".join(error_txt)
def __call__(self, text: str) -> Union[BaseModel, dict, None]:
text = text.strip()
if text.startswith("```"):
try:
text = CodeBlockParser()(text)
except ValueError:
if self.strict:
raise
else:
return None
try:
data = json.loads(text.strip())
except json.JSONDecodeError as e:
if self.strict:
raise ValueError(f"JSON is not valid: {e}") from e
else:
return None
if self.spec is None:
return data
try:
model = self.spec(**data)
except ValidationError as err:
errtxt = self.errors_to_markdown(err.errors())
raise ValueError(f"Invalid JSON format:\n{errtxt}") from err
except Exception as err:
raise ValueError(f"Error parsing JSON: {err}") from err
return model
class EnumParser:
def __init__(self, spec: Enum, ignore_case: bool = True):
self.spec = spec
self.ignore_case = ignore_case
def __call__(self, text: str) -> Enum:
text = text.strip()
if self.ignore_case:
text = text.lower()
try:
return self.spec(text)
except ValueError as e:
options = ", ".join([str(v) for v in self.spec])
raise ValueError(f"Invalid option '{text}'; valid options: {options}") from e
class StringParser:
def __call__(self, text: str) -> str:
# Strip any leading and trailing whitespace
text = text.strip()
# Check and remove quotes at the start and end if they match
if text.startswith(("'", '"')) and text.endswith(("'", '"')) and len(text) > 1:
# Remove the first and last character if they are both quotes
if text[0] == text[-1]:
text = text[1:-1]
return text