Support custom text formats and recursive directory search (#496)

* Add custom text types and recursive

* Add custom text types and recursive

* Fix format

* Update qdrant, Add pdf to unstructured

* Use unstructured as the default text extractor if installed

* Add tests for unstructured

* Update tests env for unstructured

* Fix error if last message is a function call, issue #569

* Remove csv, md and tsv from UNSTRUCTURED_FORMATS

* Update docstring of docs_path

* Update test for get_files_from_dir

* Update docstring of custom_text_types

* Fix missing search_string in update_context

* Add custom_text_types to notebook example
This commit is contained in:
Li Jiang
2023-11-21 11:53:50 +08:00
committed by GitHub
parent ef1c3d3f7f
commit 07646d448c
7 changed files with 516 additions and 269 deletions

View File

@@ -60,12 +60,34 @@ class TestRetrieveUtils:
)
def test_get_files_from_dir(self):
files = get_files_from_dir(test_dir)
files = get_files_from_dir(test_dir, recursive=False)
assert all(os.path.isfile(file) for file in files)
pdf_file_path = os.path.join(test_dir, "example.pdf")
txt_file_path = os.path.join(test_dir, "example.txt")
files = get_files_from_dir([pdf_file_path, txt_file_path])
assert all(os.path.isfile(file) for file in files)
files = get_files_from_dir(
[
pdf_file_path,
txt_file_path,
os.path.join(test_dir, "..", "..", "website/docs"),
"https://raw.githubusercontent.com/microsoft/autogen/main/README.md",
],
recursive=True,
)
assert all(os.path.isfile(file) for file in files)
files = get_files_from_dir(
[
pdf_file_path,
txt_file_path,
os.path.join(test_dir, "..", "..", "website/docs"),
"https://raw.githubusercontent.com/microsoft/autogen/main/README.md",
],
recursive=True,
types=["pdf", "txt"],
)
assert all(os.path.isfile(file) for file in files)
assert len(files) == 3
def test_is_url(self):
assert is_url("https://www.example.com")
@@ -168,6 +190,7 @@ class TestRetrieveUtils:
collection_name="mytestcollection",
custom_text_split_function=custom_text_split_function,
get_or_create=True,
recursive=False,
)
results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
assert (
@@ -181,6 +204,7 @@ class TestRetrieveUtils:
dir_path="./website/docs",
client=client,
collection_name="autogen-docs",
custom_text_types=["txt", "md", "rtf", "rst"],
get_or_create=True,
)
results = query_vector_db(