From cd9f54c21e9bba1482c32462c0416c62a23f4771 Mon Sep 17 00:00:00 2001 From: Senko Rasic Date: Fri, 19 Jan 2024 18:01:32 -0800 Subject: [PATCH] Ignore files based on name(glob), size and whether they're binary files --- README.md | 2 +- docker-compose.yml | 2 +- pilot/.env.example | 2 +- pilot/const/common.py | 20 +++--- pilot/helpers/Project.py | 35 ++++------- pilot/helpers/cli.py | 66 +++++++------------- pilot/helpers/files.py | 32 ++++++---- pilot/helpers/test_Project.py | 2 +- pilot/test/helpers/test_files.py | 9 ++- pilot/test/utils/test_ignore.py | 102 +++++++++++++++++++++++++++++++ pilot/utils/ignore.py | 97 +++++++++++++++++++++++++++++ 11 files changed, 274 insertions(+), 95 deletions(-) create mode 100644 pilot/test/utils/test_ignore.py create mode 100644 pilot/utils/ignore.py diff --git a/README.md b/README.md index d8678e77..f2ccfb34 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ After you have Python and (optionally) PostgreSQL installed, follow these steps: - LLM Provider (OpenAI/Azure/Openrouter) - Your API key - database settings: SQLite/PostgreSQL (to change from SQLite to PostgreSQL, just set `DATABASE_TYPE=postgres`) - - optionally set IGNORE_FOLDERS for the folders which shouldn't be tracked by GPT Pilot in workspace, useful to ignore folders created by compilers (i.e. `IGNORE_FOLDERS=folder1,folder2,folder3`) + - optionally set IGNORE_PATHS for the folders which shouldn't be tracked by GPT Pilot in workspace, useful to ignore folders created by compilers (i.e. `IGNORE_PATHS=folder1,folder2,folder3`) 9. `python db_init.py` (initialize the database) 10. 
`python main.py` (start GPT Pilot) diff --git a/docker-compose.yml b/docker-compose.yml index d8277769..e570ef23 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: - DB_USER=pilot - DB_PASSWORD=pilot # Folders which shouldn't be tracked in workspace (useful to ignore folders created by compiler) - # IGNORE_FOLDERS=folder1,folder2 + # IGNORE_PATHS=folder1,folder2 volumes: - ~/gpt-pilot-workspace:/usr/src/app/workspace build: diff --git a/pilot/.env.example b/pilot/.env.example index b406f48a..856f83ba 100644 --- a/pilot/.env.example +++ b/pilot/.env.example @@ -16,7 +16,7 @@ MODEL_NAME=gpt-4-1106-preview MAX_TOKENS=8192 # Folders which shouldn't be tracked in workspace (useful to ignore folders created by compiler) -# IGNORE_FOLDERS=folder1,folder2 +# IGNORE_PATHS=folder1,folder2 # Database # DATABASE_TYPE=postgres diff --git a/pilot/const/common.py b/pilot/const/common.py index 0b5374b6..1b8ae02a 100644 --- a/pilot/const/common.py +++ b/pilot/const/common.py @@ -21,10 +21,7 @@ STEPS = [ 'finished' ] -additional_ignore_folders = os.environ.get('IGNORE_FOLDERS', '').split(',') - -# TODO: rename to IGNORE_PATHS as it also contains files -IGNORE_FOLDERS = [ +DEFAULT_IGNORE_PATHS = [ '.git', '.gpt-pilot', '.idea', @@ -36,7 +33,16 @@ IGNORE_FOLDERS = [ 'venv', 'dist', 'build', - 'target' -] + [folder for folder in additional_ignore_folders if folder] - + 'target', + "*.min.js", + "*.min.css", + "*.svg", + "*.csv", +] +IGNORE_PATHS = DEFAULT_IGNORE_PATHS + [ + folder for folder + in os.environ.get('IGNORE_PATHS', '').split(',') + if folder +] +IGNORE_SIZE_THRESHOLD = 102400 # 100K+ files are ignored by default PROMPT_DATA_TO_IGNORE = {'directory_tree', 'name'} diff --git a/pilot/helpers/Project.py b/pilot/helpers/Project.py index 125f748f..2ac665ae 100644 --- a/pilot/helpers/Project.py +++ b/pilot/helpers/Project.py @@ -8,7 +8,7 @@ import peewee from const.messages import CHECK_AND_CONTINUE, AFFIRMATIVE_ANSWERS, NEGATIVE_ANSWERS from 
utils.style import color_yellow_bold, color_cyan, color_white_bold, color_green -from const.common import IGNORE_FOLDERS, STEPS +from const.common import STEPS from database.database import delete_unconnected_steps_from, delete_all_app_development_data, update_app_status from const.ipc import MESSAGE_TYPE from prompts.prompts import ask_user @@ -28,6 +28,7 @@ from database.models.files import File from logger.logger import logger from utils.dot_gpt_pilot import DotGptPilot from utils.llm_connection import test_api_access +from utils.ignore import IgnoreMatcher from utils.telemetry import telemetry @@ -176,12 +177,7 @@ class Project: Returns: dict: The directory tree. """ - # files = {} - # if with_descriptions and False: - # files = File.select().where(File.app_id == self.args['app_id']) - # files = {snapshot.name: snapshot for snapshot in files} - # return build_directory_tree_with_descriptions(self.root_path, ignore=IGNORE_FOLDERS, files=files, add_descriptions=False) - return build_directory_tree(self.root_path, ignore=IGNORE_FOLDERS) + return build_directory_tree(self.root_path) def get_test_directory_tree(self): """ @@ -191,7 +187,7 @@ class Project: dict: The directory tree of tests. 
""" # TODO remove hardcoded path - return build_directory_tree(self.root_path + '/tests', ignore=IGNORE_FOLDERS) + return build_directory_tree(self.root_path + '/tests') def get_all_coded_files(self): """ @@ -209,18 +205,7 @@ class Project: ) ) - files = self.get_files([file.path + '/' + file.name for file in files]) - - # Don't send contents of binary files - for file in files: - if not isinstance(file["content"], str): - file["content"] = f"<>" - - # TODO temoprary fix to eliminate files that are not in the project - files = [file for file in files if file['content'] != ''] - # TODO END - - return files + return self.get_files([file.path + '/' + file.name for file in files]) def get_files(self, files): """ @@ -232,6 +217,7 @@ class Project: Returns: list: A list of files with content. """ + matcher = IgnoreMatcher(root_path=self.root_path) files_with_content = [] for file_path in files: try: @@ -239,9 +225,12 @@ class Project: _, full_path = self.get_full_file_path(file_path, file_path) file_data = get_file_contents(full_path, self.root_path) except ValueError: + full_path = None file_data = {"path": file_path, "name": os.path.basename(file_path), "content": ''} - files_with_content.append(file_data) + if full_path and file_data["content"] != "" and not matcher.ignore(full_path): + files_with_content.append(file_data) + return files_with_content def find_input_required_lines(self, file_content): @@ -395,7 +384,7 @@ class Project: def save_files_snapshot(self, development_step_id): - files = get_directory_contents(self.root_path, ignore=IGNORE_FOLDERS) + files = get_directory_contents(self.root_path) development_step, created = DevelopmentSteps.get_or_create(id=development_step_id) total_files = 0 @@ -431,7 +420,7 @@ class Project: development_step = DevelopmentSteps.get(DevelopmentSteps.id == development_step_id) file_snapshots = FileSnapshot.select().where(FileSnapshot.development_step == development_step) - clear_directory(self.root_path, IGNORE_FOLDERS + 
self.files) + clear_directory(self.root_path, ignore=self.files) for file_snapshot in file_snapshots: update_file(file_snapshot.file.full_path, file_snapshot.content, project=self) if file_snapshot.file.full_path not in self.files: diff --git a/pilot/helpers/cli.py b/pilot/helpers/cli.py index 83c458aa..475ae2f7 100644 --- a/pilot/helpers/cli.py +++ b/pilot/helpers/cli.py @@ -10,6 +10,7 @@ from typing import Dict, Union from logger.logger import logger from utils.style import color_yellow, color_green, color_red, color_yellow_bold +from utils.ignore import IgnoreMatcher from database.database import get_saved_command_run, save_command_run from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError from helpers.exceptions.TokenLimitError import TokenLimitError @@ -340,23 +341,24 @@ def check_if_command_successful(convo, command, cli_response, response, exit_cod return response -def build_directory_tree(path, prefix='', is_root=True, ignore=None): +def build_directory_tree(path, prefix='', root_path=None) -> str: """Build the directory tree structure in a simplified format. - Args: - - path: The starting directory path. - - prefix: Prefix for the current item, used for recursion. - - is_root: Flag to indicate if the current item is the root directory. - - ignore: a list of directories to ignore - - Returns: - - A string representation of the directory tree. + :param path: The starting directory path. + :param prefix: Prefix for the current item, used for recursion. + :param root_path: The root directory path. + :return: A string representation of the directory tree. 
""" output = "" indent = ' ' + if root_path is None: + root_path = path + + matcher = IgnoreMatcher(root_path=root_path) + if os.path.isdir(path): - if is_root: + if root_path == path: output += '/' else: dir_name = os.path.basename(path) @@ -364,8 +366,16 @@ def build_directory_tree(path, prefix='', is_root=True, ignore=None): # List items in the directory items = os.listdir(path) - dirs = [item for item in items if os.path.isdir(os.path.join(path, item)) and item not in ignore] - files = [item for item in items if os.path.isfile(os.path.join(path, item))] + dirs = [] + files = [] + for item in items: + item_path = os.path.join(path, item) + if matcher.ignore(item_path): + continue + if os.path.isdir(item_path): + dirs.append(item) + elif os.path.isfile(item_path): + files.append(item) dirs.sort() files.sort() @@ -374,7 +384,7 @@ def build_directory_tree(path, prefix='', is_root=True, ignore=None): for index, dir_item in enumerate(dirs): item_path = os.path.join(path, dir_item) new_prefix = prefix + indent # Updated prefix for recursion - output += build_directory_tree(item_path, new_prefix, is_root=False, ignore=ignore) + output += build_directory_tree(item_path, new_prefix, root_path) if files: output += f"{prefix} {', '.join(files)}\n" @@ -387,36 +397,6 @@ def build_directory_tree(path, prefix='', is_root=True, ignore=None): return output -def res_for_build_directory_tree(path, files=None): - return ' - ' + files[os.path.basename(path)].description + ' ' if files and os.path.basename(path) in files else '' - - -def build_directory_tree_with_descriptions(path, prefix="", ignore=None, is_last=False, files=None): - """Build the directory tree structure in tree-like format. - Args: - - path: The starting directory path. - - prefix: Prefix for the current item, used for recursion. - - ignore: List of directory names to ignore. - - is_last: Flag to indicate if the current item is the last in its parent directory. 
- Returns: - - A string representation of the directory tree. - """ - ignore |= [] - if os.path.basename(path) in ignore: - return "" - output = "" - indent = '| ' if not is_last else ' ' - # It's a directory, add its name to the output and then recurse into it - output += prefix + f"|-- {os.path.basename(path)}{res_for_build_directory_tree(path, files)}/\n" - if os.path.isdir(path): - # List items in the directory - items = os.listdir(path) - for index, item in enumerate(items): - item_path = os.path.join(path, item) - output += build_directory_tree(item_path, prefix + indent, ignore, index == len(items) - 1, files) - return output - - def execute_command_and_check_cli_response(convo, command: dict): """ Execute a command and check its CLI response. diff --git a/pilot/helpers/files.py b/pilot/helpers/files.py index 50b4fe61..5308878c 100644 --- a/pilot/helpers/files.py +++ b/pilot/helpers/files.py @@ -3,7 +3,7 @@ import os from typing import Optional, Union from utils.style import color_green - +from utils.ignore import IgnoreMatcher def update_file(path: str, new_content: Union[str, bytes], project=None): """ @@ -87,7 +87,8 @@ def get_file_contents( def get_directory_contents( - directory: str, ignore: Optional[list[str]] = None + directory: str, + ignore: Optional[list[str]] = None, ) -> list[dict[str, Union[str, bytes]]]: """ Get the content of all files in the given directory. 
@@ -101,19 +102,22 @@ def get_directory_contents( """ return_array = [] - if ignore is None: - ignore = [] + matcher = IgnoreMatcher(ignore, root_path=directory) # TODO: Convert to use pathlib.Path.walk() for dpath, dirs, files in os.walk(directory): # In-place update of dirs so that os.walk() doesn't traverse them - dirs[:] = [d for d in dirs if d not in ignore] + dirs[:] = [ + d for d in dirs + if not matcher.ignore(os.path.join(dpath, d)) + ] for file in files: - if file in ignore: + full_path = os.path.join(dpath, file) + if matcher.ignore(full_path): continue - return_array.append(get_file_contents(os.path.join(dpath, file), directory)) + return_array.append(get_file_contents(full_path, directory)) return return_array @@ -125,20 +129,22 @@ def clear_directory(directory: str, ignore: Optional[list[str]] = None): :param dir_path: Full path to the directory to clear :param ignore: List of files or folders to ignore (optional) """ - if ignore is None: - ignore = [] + matcher = IgnoreMatcher(ignore, root_path=directory) # TODO: Convert to use pathlib.Path.walk() for dpath, dirs, files in os.walk(directory, topdown=True): # In-place update of dirs so that os.walk() doesn't traverse them - dirs[:] = [d for d in dirs if d not in ignore] + dirs[:] = [ + d for d in dirs + if not matcher.ignore(os.path.join(dpath, d)) + ] for file in files: - if file in ignore or os.path.join(directory, file) in ignore: + full_path = os.path.join(dpath, file) + if matcher.ignore(full_path): continue - path = os.path.join(dpath, file) - os.remove(path) + os.remove(full_path) # Delete empty subdirectories not in ignore list for d in dirs: diff --git a/pilot/helpers/test_Project.py b/pilot/helpers/test_Project.py index 03d36629..1aa79aaf 100644 --- a/pilot/helpers/test_Project.py +++ b/pilot/helpers/test_Project.py @@ -363,7 +363,7 @@ class TestProjectFileLists: 'user_review_goal': 'Test User Review Goal', }] - # with directories including common.IGNORE_FOLDERS + # with directories 
including common.IGNORE_PATHS src = os.path.join(project.root_path, 'src') foo = os.path.join(project.root_path, 'src/foo') files_no_folders = os.path.join(foo, 'files_no_folders') diff --git a/pilot/test/helpers/test_files.py b/pilot/test/helpers/test_files.py index b3649b32..d84d0d74 100644 --- a/pilot/test/helpers/test_files.py +++ b/pilot/test/helpers/test_files.py @@ -147,14 +147,13 @@ def test_get_directory_contents_live(): assert isinstance(this_file["content"], str) assert "test_get_directory_contents_live()" in this_file["content"] - # Check that the Python cache was loaded as a binary file - print("FILES", [(f["path"], f["name"]) for f in files]) - pycache_file = [ + # Check that the binary file was ignored + image_files = [ f for f in files if f["path"] == "helpers" and f["name"] == "testlogo.png" - ][0] - assert isinstance(pycache_file["content"], bytes) + ] + assert image_files == [] # Check that the ignore list works assert all(file["name"] != "__init__.py" for file in files) diff --git a/pilot/test/utils/test_ignore.py b/pilot/test/utils/test_ignore.py new file mode 100644 index 00000000..9608d4d9 --- /dev/null +++ b/pilot/test/utils/test_ignore.py @@ -0,0 +1,102 @@ +from unittest.mock import patch +import pytest +from tempfile import TemporaryDirectory + +from utils.ignore import IgnoreMatcher +from os.path import sep, join, dirname + +@pytest.mark.parametrize( + ("path", "expected"), + [ + (".git", True), + (".gpt-pilot", True), + (".idea", True), + (".vscode", True), + (".DS_Store", True), + (join("subdirectory", ".DS_Store"), True), + ("__pycache__", True), + (join("subdirectory", "__pycache__"), True), + ("node_modules", True), + (join("subdirectory", "node_modules"), True), + ("package-lock.json", True), + ("venv", True), + ("dist", True), + ("build", True), + ("target", True), + (".gitignore", False), + ("server.js", False), + (join(dirname(__file__), "node_modules"), True), + (join(dirname(__file__), "subdirectory", "node_modules"), True), + 
] +) +def test_default_ignore(path, expected): + matcher = IgnoreMatcher(root_path=dirname(__file__)) + assert matcher.ignore(path) == expected + + +@pytest.mark.parametrize( + ("ignore", "path", "expected"), + [ + ("*.py[co]", "test.pyc", True), + ("*.py[co]", "subdir/test.pyo", True), + ("*.py[co]", "test.py", False), + ("*.min.js", f"public{sep}js{sep}script.min.js", True), + ("*.min.js", f"public{sep}js{sep}min.js", False), + ] +) +def test_additional_ignore(ignore, path, expected): + matcher = IgnoreMatcher([ignore]) + assert matcher.ignore(path) == expected + + +@pytest.mark.parametrize( + ("ignore", "path", "expected"), + [ + ("jquery.js", "jquery.js", True), + ("jquery.js", f"otherdir{sep}jquery.js", True), + ("jquery.js", f"{sep}test{sep}jquery.js", True), + ] +) +def test_full_path(ignore, path, expected): + matcher = IgnoreMatcher([ignore], root_path=f"{sep}test") + assert matcher.ignore(path) == expected + + +@pytest.mark.parametrize( + ("size", "expected"), + [ + (1024*1024, True), # 1MB + (102400, False), # 100KB + ] +) +@patch("utils.ignore.os.path.isfile") +@patch("utils.ignore.os.path.getsize") +def test_ignore_large_files(mock_getsize, mock_isfile, size, expected): + mock_isfile.return_value = True + mock_getsize.return_value = size + matcher = IgnoreMatcher(root_path=f"{sep}test") + + with patch.object(matcher, "is_binary", return_value=False): + assert matcher.ignore("fakefile.txt") is expected + + mock_isfile.assert_called_once() + mock_getsize.assert_called_once_with(f"{sep}test{sep}fakefile.txt") + + +@pytest.mark.parametrize( + ("content", "expected"), + [ + (("hello world ŠĐŽČĆ").encode("utf-8"), False), # text + (b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a\x00\x00\x00\x0d\x49\x48\x44\x52", True), # image + ] +) +def test_ignore_binary_files(content, expected): + with TemporaryDirectory() as tmpdir: + path = join(tmpdir, "testfile.txt") + with open(path, "wb") as fp: + fp.write(content) + + matcher = IgnoreMatcher(root_path=tmpdir) + # Check both 
relative and absolute paths
+        assert matcher.ignore("testfile.txt") is expected
+        assert matcher.ignore(path) is expected
diff --git a/pilot/utils/ignore.py b/pilot/utils/ignore.py
new file mode 100644
index 00000000..a0282643
--- /dev/null
+++ b/pilot/utils/ignore.py
@@ -0,0 +1,109 @@
+from fnmatch import fnmatch
+import os.path
+from typing import Optional
+
+from const.common import IGNORE_PATHS, IGNORE_SIZE_THRESHOLD
+
+
+class IgnoreMatcher:
+    def __init__(self,
+        ignore_paths: Optional[list[str]] = None,
+        *,
+        root_path: Optional[str] = None,
+        ignore_binaries: bool = True,
+        ignore_large_files: bool = True,
+    ):
+        """
+        Initialize the IgnoreMatcher object.
+
+        The passed paths (optional) are *added* to the list of
+        ignore paths from `const.common.IGNORE_PATHS`.
+
+        :param ignore_paths: List of paths to ignore (optional)
+        :param root_path: Directory that relative paths are resolved against (optional)
+        :param ignore_binaries: If True, files that fail to decode as UTF-8 are ignored
+        :param ignore_large_files: If True, files larger than `IGNORE_SIZE_THRESHOLD` are ignored
+        """
+        if ignore_paths is None:
+            ignore_paths = []
+
+        self.ignore_paths = ignore_paths + IGNORE_PATHS
+        self.ignore_binaries = ignore_binaries
+        self.ignore_large_files = ignore_large_files
+        self.root_path = root_path
+
+    def ignore(self, path: str) -> bool:
+        """
+        Check if the given path should be ignored.
+
+        Specified path can be either the full path, or a relative path
+        (if root_path was set in the constructor).
+
+        :param path: Path to the file or directory to check
+        :return: True if the path matches an ignore pattern or is a large or
+            binary file (when those checks are enabled), False otherwise
+        """
+
+        # Turn into absolute (full) path. Compare against the root on a
+        # separator boundary so that e.g. "src_old/x" is not mistaken for
+        # a path that already lives under the root "src".
+        if self.root_path:
+            root_prefix = self.root_path.rstrip(os.sep) + os.sep
+            if path != self.root_path and not path.startswith(root_prefix):
+                path = os.path.join(self.root_path, path)
+
+        if self.is_in_ignore_list(path):
+            return True
+
+        if self.ignore_large_files and self.is_large_file(path):
+            return True
+
+        if self.ignore_binaries and self.is_binary(path):
+            return True
+
+        return False
+
+    def is_in_ignore_list(self, path: str) -> bool:
+        """
+        Check if the given path matches any of the ignore patterns.
+
+        Only the last path component is matched against the patterns.
+
+        :param path: The path to the file or directory to check
+        :return: True if the path matches any of the ignore patterns, False otherwise.
+        """
+        name = os.path.basename(path)
+        return any(fnmatch(name, pattern) for pattern in self.ignore_paths)
+
+    def is_large_file(self, path: str) -> bool:
+        """
+        Check if the given file is larger than the threshold.
+
+        :param path: FULL path to the file to check.
+        :return: True if the file is larger than the threshold, False otherwise.
+        """
+        if not os.path.isfile(path):
+            return False
+
+        return os.path.getsize(path) > IGNORE_SIZE_THRESHOLD
+
+    def is_binary(self, path: str) -> bool:
+        """
+        Check if the given file is binary.
+
+        A file counts as binary when the start of it (up to 128K
+        characters are read) cannot be decoded as UTF-8.
+
+        :param path: FULL path to the file to check.
+        :return: True if the file is binary, False otherwise.
+        """
+        if not os.path.isfile(path):
+            return False
+
+        try:
+            # Context manager so the handle is closed even when decoding fails
+            with open(path, "r", encoding="utf-8") as fp:
+                fp.read(128*1024)
+            return False
+        except UnicodeDecodeError:
+            return True