mirror of
https://github.com/microsoft/autogen.git
synced 2026-04-20 03:02:16 -04:00
Support custom text formats and recursive (#496)
* Add custom text types and recursive
* Add custom text types and recursive
* Fix format
* Update qdrant, Add pdf to unstructured
* Use unstructured as the default text extractor if installed
* Add tests for unstructured
* Update tests env for unstructured
* Fix error if last message is a function call, issue #569
* Remove csv, md and tsv from UNSTRUCTURED_FORMATS
* Update docstring of docs_path
* Update test for get_files_from_dir
* Update docstring of custom_text_types
* Fix missing search_string in update_context
* Add custom_text_types to notebook example
This commit is contained in:
@@ -60,12 +60,34 @@ class TestRetrieveUtils:
|
||||
)
|
||||
|
||||
def test_get_files_from_dir(self):
|
||||
files = get_files_from_dir(test_dir)
|
||||
files = get_files_from_dir(test_dir, recursive=False)
|
||||
assert all(os.path.isfile(file) for file in files)
|
||||
pdf_file_path = os.path.join(test_dir, "example.pdf")
|
||||
txt_file_path = os.path.join(test_dir, "example.txt")
|
||||
files = get_files_from_dir([pdf_file_path, txt_file_path])
|
||||
assert all(os.path.isfile(file) for file in files)
|
||||
files = get_files_from_dir(
|
||||
[
|
||||
pdf_file_path,
|
||||
txt_file_path,
|
||||
os.path.join(test_dir, "..", "..", "website/docs"),
|
||||
"https://raw.githubusercontent.com/microsoft/autogen/main/README.md",
|
||||
],
|
||||
recursive=True,
|
||||
)
|
||||
assert all(os.path.isfile(file) for file in files)
|
||||
files = get_files_from_dir(
|
||||
[
|
||||
pdf_file_path,
|
||||
txt_file_path,
|
||||
os.path.join(test_dir, "..", "..", "website/docs"),
|
||||
"https://raw.githubusercontent.com/microsoft/autogen/main/README.md",
|
||||
],
|
||||
recursive=True,
|
||||
types=["pdf", "txt"],
|
||||
)
|
||||
assert all(os.path.isfile(file) for file in files)
|
||||
assert len(files) == 3
|
||||
|
||||
def test_is_url(self):
|
||||
assert is_url("https://www.example.com")
|
||||
@@ -168,6 +190,7 @@ class TestRetrieveUtils:
|
||||
collection_name="mytestcollection",
|
||||
custom_text_split_function=custom_text_split_function,
|
||||
get_or_create=True,
|
||||
recursive=False,
|
||||
)
|
||||
results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
|
||||
assert (
|
||||
@@ -181,6 +204,7 @@ class TestRetrieveUtils:
|
||||
dir_path="./website/docs",
|
||||
client=client,
|
||||
collection_name="autogen-docs",
|
||||
custom_text_types=["txt", "md", "rtf", "rst"],
|
||||
get_or_create=True,
|
||||
)
|
||||
results = query_vector_db(
|
||||
|
||||
Reference in New Issue
Block a user