mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-02-01 18:35:00 -05:00
Merge pull request #992 from maiko/add_ingest_documents_script
Add data_ingestion.py script for memory pre-seeding
This commit is contained in:
70
scripts/data_ingestion.py
Normal file
70
scripts/data_ingestion.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import argparse
|
||||
import logging
|
||||
from config import Config
|
||||
from memory import get_memory
|
||||
from file_operations import ingest_file, search_files
|
||||
|
||||
cfg = Config()
|
||||
|
||||
|
||||
def configure_logging():
    """Configure file-based logging for the ingestion run and return its logger.

    Appends to 'log-ingestion.txt' in the working directory at DEBUG level.
    """
    message_format = '%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s'
    logging.basicConfig(
        filename='log-ingestion.txt',
        filemode='a',
        format=message_format,
        datefmt='%H:%M:%S',
        level=logging.DEBUG,
    )
    return logging.getLogger('AutoGPT-Ingestion')
|
||||
|
||||
|
||||
def ingest_directory(directory, memory, args):
    """
    Ingest all files in a directory by calling the ingest_file function for each file.

    :param directory: The directory containing the files to ingest
    :param memory: An object with an add() method to store the chunks in memory
    :param args: Parsed CLI namespace supplying max_length and overlap
    """
    try:
        # search_files enumerates the candidate files; each one is ingested
        # with the chunking parameters taken from the CLI arguments.
        for found_file in search_files(directory):
            ingest_file(found_file, memory, args.max_length, args.overlap)
    except Exception as e:
        print(f"Error while ingesting directory '{directory}': {str(e)}")
|
||||
|
||||
|
||||
def _build_parser():
    # Assembles the CLI: exactly one of --file/--dir, plus chunking options.
    parser = argparse.ArgumentParser(description="Ingest a file or a directory with multiple files into memory. Make sure to set your .env before running this script.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--file", type=str, help="The file to ingest.")
    group.add_argument("--dir", type=str, help="The directory containing the files to ingest.")
    parser.add_argument("--init", action='store_true', help="Init the memory and wipe its content (default: False)", default=False)
    parser.add_argument("--overlap", type=int, help="The overlap size between chunks when ingesting files (default: 200)", default=200)
    parser.add_argument("--max_length", type=int, help="The max_length of each chunk when ingesting files (default: 4000)", default=4000)
    return parser


def main():
    """CLI entry point: parse arguments, open the memory backend, then ingest
    either a single file or a whole directory into it."""
    logger = configure_logging()

    args = _build_parser().parse_args()

    # Initialize memory (optionally wiped when --init is passed)
    memory = get_memory(cfg, init=args.init)
    print('Using memory of type: ' + memory.__class__.__name__)

    if args.file:
        try:
            ingest_file(args.file, memory, args.max_length, args.overlap)
            print(f"File '{args.file}' ingested successfully.")
        except Exception as e:
            logger.error(f"Error while ingesting file '{args.file}': {str(e)}")
            print(f"Error while ingesting file '{args.file}': {str(e)}")
    elif args.dir:
        try:
            ingest_directory(args.dir, memory, args)
            print(f"Directory '{args.dir}' ingested successfully.")
        except Exception as e:
            logger.error(f"Error while ingesting directory '{args.dir}': {str(e)}")
            print(f"Error while ingesting directory '{args.dir}': {str(e)}")
    else:
        print("Please provide either a file path (--file) or a directory name (--dir) inside the auto_gpt_workspace directory as input.")
|
||||
@@ -20,6 +20,29 @@ def safe_join(base, *paths):
|
||||
return norm_new_path
|
||||
|
||||
|
||||
def split_file(content, max_length=4000, overlap=0):
    """
    Split text into chunks of a specified maximum length with a specified overlap
    between chunks.

    :param content: The input text to be split into chunks
    :param max_length: The maximum length of each chunk, default is 4000 (about 1k token)
    :param overlap: The number of overlapping characters between chunks, default is no overlap
    :return: A generator yielding chunks of text
    :raises ValueError: if overlap >= max_length (the start index would never
        advance and the loop would run forever)
    """
    if overlap >= max_length:
        raise ValueError("overlap must be smaller than max_length")

    start = 0
    content_length = len(content)

    while start < content_length:
        end = start + max_length
        if end + overlap < content_length:
            yield content[start:end + overlap]
        else:
            # Final chunk: emit the remainder and stop. Without the break,
            # `start` keeps advancing by max_length - overlap and the tail of
            # the text is re-yielded as a duplicate (already-covered) chunk.
            yield content[start:content_length]
            break
        start += max_length - overlap
|
||||
|
||||
|
||||
def read_file(filename):
|
||||
"""Read a file and return the contents"""
|
||||
try:
|
||||
@@ -31,6 +54,37 @@ def read_file(filename):
|
||||
return "Error: " + str(e)
|
||||
|
||||
|
||||
def ingest_file(filename, memory, max_length=4000, overlap=200):
    """
    Ingest a file by reading its content, splitting it into chunks with a specified
    maximum length and overlap, and adding the chunks to the memory storage.

    :param filename: The name of the file to ingest
    :param memory: An object with an add() method to store the chunks in memory
    :param max_length: The maximum length of each chunk, default is 4000
    :param overlap: The number of overlapping characters between chunks, default is 200
    """
    try:
        # The messages below were f-strings with a literal placeholder and no
        # interpolation; the filename is now actually substituted in.
        print(f"Working with file {filename}")
        content = read_file(filename)
        content_length = len(content)
        print(f"File length: {content_length} characters")

        chunks = list(split_file(content, max_length=max_length, overlap=overlap))

        num_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            print(f"Ingesting chunk {i + 1} / {num_chunks} into memory")
            # Prefix each chunk with its source file and position so the
            # stored memory entry stays traceable to the original document.
            memory_to_add = f"Filename: {filename}\n" \
                            f"Content part#{i + 1}/{num_chunks}: {chunk}"

            memory.add(memory_to_add)

        print(f"Done ingesting {num_chunks} chunks from {filename}.")
    except Exception as e:
        print(f"Error while ingesting file '{filename}': {str(e)}")
|
||||
|
||||
|
||||
def write_to_file(filename, text):
|
||||
"""Write text to a file"""
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user