Ignore files based on name(glob), size and whether they're binary files

This commit is contained in:
Senko Rasic
2024-01-19 18:01:32 -08:00
parent 01ae996afe
commit cd9f54c21e
11 changed files with 274 additions and 95 deletions

View File

@@ -85,7 +85,7 @@ After you have Python and (optionally) PostgreSQL installed, follow these steps:
- LLM Provider (OpenAI/Azure/Openrouter)
- Your API key
- database settings: SQLite/PostgreSQL (to change from SQLite to PostgreSQL, just set `DATABASE_TYPE=postgres`)
- optionally set IGNORE_FOLDERS for the folders which shouldn't be tracked by GPT Pilot in workspace, useful to ignore folders created by compilers (i.e. `IGNORE_FOLDERS=folder1,folder2,folder3`)
- optionally set IGNORE_PATHS for the files and folders which shouldn't be tracked by GPT Pilot in workspace, useful to ignore folders created by compilers (e.g. `IGNORE_PATHS=folder1,folder2,folder3`); glob patterns such as `*.min.js` are also supported
9. `python db_init.py` (initialize the database)
10. `python main.py` (start GPT Pilot)

View File

@@ -18,7 +18,7 @@ services:
- DB_USER=pilot
- DB_PASSWORD=pilot
# Folders which shouldn't be tracked in workspace (useful to ignore folders created by compiler)
# IGNORE_FOLDERS=folder1,folder2
# IGNORE_PATHS=folder1,folder2
volumes:
- ~/gpt-pilot-workspace:/usr/src/app/workspace
build:

View File

@@ -16,7 +16,7 @@ MODEL_NAME=gpt-4-1106-preview
MAX_TOKENS=8192
# Folders which shouldn't be tracked in workspace (useful to ignore folders created by compiler)
# IGNORE_FOLDERS=folder1,folder2
# IGNORE_PATHS=folder1,folder2
# Database
# DATABASE_TYPE=postgres

View File

@@ -21,10 +21,7 @@ STEPS = [
'finished'
]
additional_ignore_folders = os.environ.get('IGNORE_FOLDERS', '').split(',')
# TODO: rename to IGNORE_PATHS as it also contains files
IGNORE_FOLDERS = [
DEFAULT_IGNORE_PATHS = [
'.git',
'.gpt-pilot',
'.idea',
@@ -36,7 +33,16 @@ IGNORE_FOLDERS = [
'venv',
'dist',
'build',
'target'
] + [folder for folder in additional_ignore_folders if folder]
'target',
"*.min.js",
"*.min.css",
"*.svg",
"*.csv",
]
IGNORE_PATHS = DEFAULT_IGNORE_PATHS + [
folder for folder
in os.environ.get('IGNORE_PATHS', '').split(',')
if folder
]
IGNORE_SIZE_THRESHOLD = 102400 # 100K+ files are ignored by default
PROMPT_DATA_TO_IGNORE = {'directory_tree', 'name'}

View File

@@ -8,7 +8,7 @@ import peewee
from const.messages import CHECK_AND_CONTINUE, AFFIRMATIVE_ANSWERS, NEGATIVE_ANSWERS
from utils.style import color_yellow_bold, color_cyan, color_white_bold, color_green
from const.common import IGNORE_FOLDERS, STEPS
from const.common import STEPS
from database.database import delete_unconnected_steps_from, delete_all_app_development_data, update_app_status
from const.ipc import MESSAGE_TYPE
from prompts.prompts import ask_user
@@ -28,6 +28,7 @@ from database.models.files import File
from logger.logger import logger
from utils.dot_gpt_pilot import DotGptPilot
from utils.llm_connection import test_api_access
from utils.ignore import IgnoreMatcher
from utils.telemetry import telemetry
@@ -176,12 +177,7 @@ class Project:
Returns:
dict: The directory tree.
"""
# files = {}
# if with_descriptions and False:
# files = File.select().where(File.app_id == self.args['app_id'])
# files = {snapshot.name: snapshot for snapshot in files}
# return build_directory_tree_with_descriptions(self.root_path, ignore=IGNORE_FOLDERS, files=files, add_descriptions=False)
return build_directory_tree(self.root_path, ignore=IGNORE_FOLDERS)
return build_directory_tree(self.root_path)
def get_test_directory_tree(self):
"""
@@ -191,7 +187,7 @@ class Project:
dict: The directory tree of tests.
"""
# TODO remove hardcoded path
return build_directory_tree(self.root_path + '/tests', ignore=IGNORE_FOLDERS)
return build_directory_tree(self.root_path + '/tests')
def get_all_coded_files(self):
"""
@@ -209,18 +205,7 @@ class Project:
)
)
files = self.get_files([file.path + '/' + file.name for file in files])
# Don't send contents of binary files
for file in files:
if not isinstance(file["content"], str):
file["content"] = f"<<binary file, {len(file['content'])} bytes>>"
# TODO temoprary fix to eliminate files that are not in the project
files = [file for file in files if file['content'] != '']
# TODO END
return files
return self.get_files([file.path + '/' + file.name for file in files])
def get_files(self, files):
"""
@@ -232,6 +217,7 @@ class Project:
Returns:
list: A list of files with content.
"""
matcher = IgnoreMatcher(root_path=self.root_path)
files_with_content = []
for file_path in files:
try:
@@ -239,9 +225,12 @@ class Project:
_, full_path = self.get_full_file_path(file_path, file_path)
file_data = get_file_contents(full_path, self.root_path)
except ValueError:
full_path = None
file_data = {"path": file_path, "name": os.path.basename(file_path), "content": ''}
files_with_content.append(file_data)
if full_path and file_data["content"] != "" and not matcher.ignore(full_path):
files_with_content.append(file_data)
return files_with_content
def find_input_required_lines(self, file_content):
@@ -395,7 +384,7 @@ class Project:
def save_files_snapshot(self, development_step_id):
files = get_directory_contents(self.root_path, ignore=IGNORE_FOLDERS)
files = get_directory_contents(self.root_path)
development_step, created = DevelopmentSteps.get_or_create(id=development_step_id)
total_files = 0
@@ -431,7 +420,7 @@ class Project:
development_step = DevelopmentSteps.get(DevelopmentSteps.id == development_step_id)
file_snapshots = FileSnapshot.select().where(FileSnapshot.development_step == development_step)
clear_directory(self.root_path, IGNORE_FOLDERS + self.files)
clear_directory(self.root_path, ignore=self.files)
for file_snapshot in file_snapshots:
update_file(file_snapshot.file.full_path, file_snapshot.content, project=self)
if file_snapshot.file.full_path not in self.files:

View File

@@ -10,6 +10,7 @@ from typing import Dict, Union
from logger.logger import logger
from utils.style import color_yellow, color_green, color_red, color_yellow_bold
from utils.ignore import IgnoreMatcher
from database.database import get_saved_command_run, save_command_run
from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError
from helpers.exceptions.TokenLimitError import TokenLimitError
@@ -340,23 +341,24 @@ def check_if_command_successful(convo, command, cli_response, response, exit_cod
return response
def build_directory_tree(path, prefix='', is_root=True, ignore=None):
def build_directory_tree(path, prefix='', root_path=None) -> str:
"""Build the directory tree structure in a simplified format.
Args:
- path: The starting directory path.
- prefix: Prefix for the current item, used for recursion.
- is_root: Flag to indicate if the current item is the root directory.
- ignore: a list of directories to ignore
Returns:
- A string representation of the directory tree.
:param path: The starting directory path.
:param prefix: Prefix for the current item, used for recursion.
:param root_path: The root directory path.
:return: A string representation of the directory tree.
"""
output = ""
indent = ' '
if root_path is None:
root_path = path
matcher = IgnoreMatcher(root_path=root_path)
if os.path.isdir(path):
if is_root:
if root_path == path:
output += '/'
else:
dir_name = os.path.basename(path)
@@ -364,8 +366,16 @@ def build_directory_tree(path, prefix='', is_root=True, ignore=None):
# List items in the directory
items = os.listdir(path)
dirs = [item for item in items if os.path.isdir(os.path.join(path, item)) and item not in ignore]
files = [item for item in items if os.path.isfile(os.path.join(path, item))]
dirs = []
files = []
for item in items:
item_path = os.path.join(path, item)
if matcher.ignore(item_path):
continue
if os.path.isdir(item_path):
dirs.append(item)
elif os.path.isfile(item_path):
files.append(item)
dirs.sort()
files.sort()
@@ -374,7 +384,7 @@ def build_directory_tree(path, prefix='', is_root=True, ignore=None):
for index, dir_item in enumerate(dirs):
item_path = os.path.join(path, dir_item)
new_prefix = prefix + indent # Updated prefix for recursion
output += build_directory_tree(item_path, new_prefix, is_root=False, ignore=ignore)
output += build_directory_tree(item_path, new_prefix, root_path)
if files:
output += f"{prefix} {', '.join(files)}\n"
@@ -387,36 +397,6 @@ def build_directory_tree(path, prefix='', is_root=True, ignore=None):
return output
def res_for_build_directory_tree(path, files=None):
return ' - ' + files[os.path.basename(path)].description + ' ' if files and os.path.basename(path) in files else ''
def build_directory_tree_with_descriptions(path, prefix="", ignore=None, is_last=False, files=None):
"""Build the directory tree structure in tree-like format.
Args:
- path: The starting directory path.
- prefix: Prefix for the current item, used for recursion.
- ignore: List of directory names to ignore.
- is_last: Flag to indicate if the current item is the last in its parent directory.
Returns:
- A string representation of the directory tree.
"""
ignore |= []
if os.path.basename(path) in ignore:
return ""
output = ""
indent = '| ' if not is_last else ' '
# It's a directory, add its name to the output and then recurse into it
output += prefix + f"|-- {os.path.basename(path)}{res_for_build_directory_tree(path, files)}/\n"
if os.path.isdir(path):
# List items in the directory
items = os.listdir(path)
for index, item in enumerate(items):
item_path = os.path.join(path, item)
output += build_directory_tree(item_path, prefix + indent, ignore, index == len(items) - 1, files)
return output
def execute_command_and_check_cli_response(convo, command: dict):
"""
Execute a command and check its CLI response.

View File

@@ -3,7 +3,7 @@ import os
from typing import Optional, Union
from utils.style import color_green
from utils.ignore import IgnoreMatcher
def update_file(path: str, new_content: Union[str, bytes], project=None):
"""
@@ -87,7 +87,8 @@ def get_file_contents(
def get_directory_contents(
directory: str, ignore: Optional[list[str]] = None
directory: str,
ignore: Optional[list[str]] = None,
) -> list[dict[str, Union[str, bytes]]]:
"""
Get the content of all files in the given directory.
@@ -101,19 +102,22 @@ def get_directory_contents(
"""
return_array = []
if ignore is None:
ignore = []
matcher = IgnoreMatcher(ignore, root_path=directory)
# TODO: Convert to use pathlib.Path.walk()
for dpath, dirs, files in os.walk(directory):
# In-place update of dirs so that os.walk() doesn't traverse them
dirs[:] = [d for d in dirs if d not in ignore]
dirs[:] = [
d for d in dirs
if not matcher.ignore(os.path.join(dpath, d))
]
for file in files:
if file in ignore:
full_path = os.path.join(dpath, file)
if matcher.ignore(full_path):
continue
return_array.append(get_file_contents(os.path.join(dpath, file), directory))
return_array.append(get_file_contents(full_path, directory))
return return_array
@@ -125,20 +129,22 @@ def clear_directory(directory: str, ignore: Optional[list[str]] = None):
:param dir_path: Full path to the directory to clear
:param ignore: List of files or folders to ignore (optional)
"""
if ignore is None:
ignore = []
matcher = IgnoreMatcher(ignore, root_path=directory)
# TODO: Convert to use pathlib.Path.walk()
for dpath, dirs, files in os.walk(directory, topdown=True):
# In-place update of dirs so that os.walk() doesn't traverse them
dirs[:] = [d for d in dirs if d not in ignore]
dirs[:] = [
d for d in dirs
if not matcher.ignore(os.path.join(dpath, d))
]
for file in files:
if file in ignore or os.path.join(directory, file) in ignore:
full_path = os.path.join(dpath, file)
if matcher.ignore(full_path):
continue
path = os.path.join(dpath, file)
os.remove(path)
os.remove(full_path)
# Delete empty subdirectories not in ignore list
for d in dirs:

View File

@@ -363,7 +363,7 @@ class TestProjectFileLists:
'user_review_goal': 'Test User Review Goal',
}]
# with directories including common.IGNORE_FOLDERS
# with directories including common.IGNORE_PATHS
src = os.path.join(project.root_path, 'src')
foo = os.path.join(project.root_path, 'src/foo')
files_no_folders = os.path.join(foo, 'files_no_folders')

View File

@@ -147,14 +147,13 @@ def test_get_directory_contents_live():
assert isinstance(this_file["content"], str)
assert "test_get_directory_contents_live()" in this_file["content"]
# Check that the Python cache was loaded as a binary file
print("FILES", [(f["path"], f["name"]) for f in files])
pycache_file = [
# Check that the binary file was ignored
image_files = [
f
for f in files
if f["path"] == "helpers" and f["name"] == "testlogo.png"
][0]
assert isinstance(pycache_file["content"], bytes)
]
assert image_files == []
# Check that the ignore list works
assert all(file["name"] != "__init__.py" for file in files)

View File

@@ -0,0 +1,102 @@
from unittest.mock import patch
import pytest
from tempfile import TemporaryDirectory
from utils.ignore import IgnoreMatcher
from os.path import sep, join, dirname
@pytest.mark.parametrize(
    "path,expected",
    [
        (".git", True),
        (".gpt-pilot", True),
        (".idea", True),
        (".vscode", True),
        (".DS_Store", True),
        (join("subdirectory", ".DS_Store"), True),
        ("__pycache__", True),
        (join("subdirectory", "__pycache__"), True),
        ("node_modules", True),
        (join("subdirectory", "node_modules"), True),
        ("package-lock.json", True),
        ("venv", True),
        ("dist", True),
        ("build", True),
        ("target", True),
        (".gitignore", False),
        ("server.js", False),
        (join(dirname(__file__), "node_modules"), True),
        (join(dirname(__file__), "subdirectory", "node_modules"), True),
    ],
)
def test_default_ignore(path, expected):
    """Check the default ignore list against bare names, nested names and full paths."""
    # No extra patterns are passed, so only the defaults from
    # const.common.IGNORE_PATHS are in effect.
    tests_dir = dirname(__file__)
    verdict = IgnoreMatcher(root_path=tests_dir).ignore(path)
    assert verdict == expected
@pytest.mark.parametrize(
    "ignore,path,expected",
    [
        ("*.py[co]", "test.pyc", True),
        ("*.py[co]", "subdir/test.pyo", True),
        ("*.py[co]", "test.py", False),
        ("*.min.js", f"public{sep}js{sep}script.min.js", True),
        ("*.min.js", f"public{sep}js{sep}min.js", False),
    ],
)
def test_additional_ignore(ignore, path, expected):
    """Check that a caller-supplied glob pattern is applied to the file name."""
    extra_patterns = [ignore]
    verdict = IgnoreMatcher(extra_patterns).ignore(path)
    assert verdict == expected
@pytest.mark.parametrize(
    "ignore,path,expected",
    [
        ("jquery.js", "jquery.js", True),
        ("jquery.js", f"otherdir{sep}jquery.js", True),
        ("jquery.js", f"{sep}test{sep}jquery.js", True),
    ],
)
def test_full_path(ignore, path, expected):
    """Check that a name pattern matches relative, nested and absolute paths alike."""
    root = f"{sep}test"
    ignore_matcher = IgnoreMatcher([ignore], root_path=root)
    verdict = ignore_matcher.ignore(path)
    assert verdict == expected
@pytest.mark.parametrize(
    "size,expected",
    [
        (1024*1024, True),  # 1MB
        (102400, False),  # 100KB
    ],
)
@patch("utils.ignore.os.path.isfile")
@patch("utils.ignore.os.path.getsize")
def test_ignore_large_files(mock_getsize, mock_isfile, size, expected):
    """Check that the size threshold alone decides the outcome for a non-binary file."""
    # Pretend the path is a regular file of the parametrized size.
    mock_getsize.return_value = size
    mock_isfile.return_value = True

    ignore_matcher = IgnoreMatcher(root_path=f"{sep}test")

    # Take the binary check out of the picture so that only the size matters.
    with patch.object(ignore_matcher, "is_binary", return_value=False):
        verdict = ignore_matcher.ignore("fakefile.txt")
        assert verdict is expected

    mock_isfile.assert_called_once()
    # The relative name must have been resolved against root_path.
    mock_getsize.assert_called_once_with(f"{sep}test{sep}fakefile.txt")
@pytest.mark.parametrize(
    "content,expected",
    [
        (("hello world ŠĐŽČĆ").encode("utf-8"), False),  # text
        (b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a\x00\x00\x00\x0d\x49\x48\x44\x52", True),  # image
    ],
)
def test_ignore_binary_files(content, expected):
    """Check that binary detection works on a real file, by relative and absolute path."""
    with TemporaryDirectory() as workdir:
        absolute_path = join(workdir, "testfile.txt")
        with open(absolute_path, "wb") as handle:
            handle.write(content)

        ignore_matcher = IgnoreMatcher(root_path=workdir)

        # Check both relative and absolute paths
        for candidate in ("testfile.txt", absolute_path):
            assert ignore_matcher.ignore(candidate) is expected

97
pilot/utils/ignore.py Normal file
View File

@@ -0,0 +1,97 @@
from fnmatch import fnmatch
import os.path
from typing import Optional
from const.common import IGNORE_PATHS, IGNORE_SIZE_THRESHOLD
class IgnoreMatcher:
    """
    Decide whether a file or directory should be ignored.

    A path is ignored when it matches one of the ignore patterns, or
    (both optional, enabled by default) when it is a file larger than
    `IGNORE_SIZE_THRESHOLD` or a file that is not valid UTF-8 text.
    """

    def __init__(self,
        ignore_paths: Optional[list[str]] = None,
        *,
        root_path: Optional[str] = None,
        ignore_binaries: bool = True,
        ignore_large_files: bool = True,
    ):
        """
        Initialize the IgnoreMatcher object.

        The passed paths (optional) are *added* to the list of
        ignore paths from `const.common.IGNORE_PATHS`.

        :param ignore_paths: List of paths to ignore (optional)
        :param root_path: Directory that relative paths passed to `ignore()`
            are resolved against (optional)
        :param ignore_binaries: Whether files that fail UTF-8 decoding are ignored
        :param ignore_large_files: Whether files above the size threshold are ignored
        """
        # Build a fresh list so neither the caller's sequence nor the shared
        # IGNORE_PATHS constant is aliased; this also accepts tuples.
        self.ignore_paths = [*(ignore_paths or []), *IGNORE_PATHS]
        self.ignore_binaries = ignore_binaries
        self.ignore_large_files = ignore_large_files
        self.root_path = root_path

    def ignore(self, path: str) -> bool:
        """
        Check if the given path should be ignored.

        Specified path can be either the full path, or a relative path
        (if root_path was set in the constructor).

        :param path: Path to the file or directory to check
        :return: True if the path matches an ignore pattern, is too large
            or is binary (subject to the constructor flags), False otherwise
        """
        # Turn into absolute (full) path
        if self.root_path and not path.startswith(self.root_path):
            path = os.path.join(self.root_path, path)

        # Cheap name check first; the size and content checks touch the disk.
        if self.is_in_ignore_list(path):
            return True

        if self.ignore_large_files and self.is_large_file(path):
            return True

        if self.ignore_binaries and self.is_binary(path):
            return True

        return False

    def is_in_ignore_list(self, path: str) -> bool:
        """
        Check if the given path matches any of the ignore patterns.

        Every pattern is matched (glob-style) against the base name of the
        path. A pattern that contains a path separator can never match a
        base name, so it is matched against the whole path instead; this is
        what lets callers exclude specific files by their full path.

        :param path: The path to the file or directory to check
        :return: True if the path matches any of the ignore patterns, False otherwise.
        """
        name = os.path.basename(path)
        for pattern in self.ignore_paths:
            if fnmatch(name, pattern):
                return True
            # Exact comparison first, so full paths containing glob
            # metacharacters (e.g. "[") still match themselves.
            if self._has_separator(pattern) and (
                path == pattern or fnmatch(path, pattern)
            ):
                return True
        return False

    @staticmethod
    def _has_separator(pattern: str) -> bool:
        """Return True if the pattern contains a path separator."""
        if os.sep in pattern:
            return True
        return os.altsep is not None and os.altsep in pattern

    def is_large_file(self, path: str) -> bool:
        """
        Check if the given file is larger than the threshold.

        :param path: FULL path to the file to check.
        :return: True if the file is larger than the threshold, False otherwise
            (including when the path is not a regular file).
        """
        if not os.path.isfile(path):
            return False

        return bool(os.path.getsize(path) > IGNORE_SIZE_THRESHOLD)

    def is_binary(self, path: str) -> bool:
        """
        Check if the given file is binary.

        A file counts as binary when its first 128K characters cannot be
        decoded as UTF-8.

        :param path: FULL path to the file to check.
        :return: True if the file is binary, False otherwise
            (including when the path is not a regular file).
        """
        if not os.path.isfile(path):
            return False

        try:
            # Use a context manager so the handle is closed even when
            # decoding fails; matchers are called once per file in a tree.
            with open(path, "r", encoding="utf-8") as fp:
                fp.read(128*1024)
        except UnicodeDecodeError:
            return True
        return False