Add support for unstructured (microsoft#501)

* Add support for unstructured * Fix tests * Add test and documents * Fix tests * Fix tests * Test unstructured on linux and mac
hiunlink · Nov 5, 2023 · f052977 · f052977
1 parent 0dd0fc5
commit f052977
Show file tree

Hide file tree

Showing 6 changed files with 83 additions and 11 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -42,6 +42,10 @@ jobs:
           python -c "import autogen"
           pip install -e. pytest
           pip uninstall -y openai
+      - name: Install unstructured if not windows
+        if: matrix.os != 'windows-2019'
+        run: |
+          pip install "unstructured[all-docs]"
       - name: Test with pytest
         if: matrix.python-version != '3.10'
         run: |

diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
@@ -15,6 +15,13 @@
 import pypdf
 from autogen.token_count_utils import count_token
 
+try:
+    from unstructured.partition.auto import partition
+
+    HAS_UNSTRUCTURED = True
+except ImportError:
+    HAS_UNSTRUCTURED = False
+
 logger = logging.getLogger(__name__)
 TEXT_FORMATS = [
     "txt",
@@ -33,6 +40,10 @@
     "yml",
     "pdf",
 ]
+UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
+if HAS_UNSTRUCTURED:
+    TEXT_FORMATS += UNSTRUCTURED_FORMATS
+    TEXT_FORMATS = list(set(TEXT_FORMATS))
 VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
 
 
@@ -123,7 +134,10 @@ def split_files_to_chunks(
         _, file_extension = os.path.splitext(file)
         file_extension = file_extension.lower()
 
-        if file_extension == ".pdf":
+        if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
+            text = partition(file)
+            text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
+        elif file_extension == ".pdf":
             text = extract_text_from_pdf(file)
         else:  # For non-PDF text-based files
             with open(file, "r", encoding="utf-8", errors="ignore") as f:

diff --git a/test/test_files/example.docx b/test/test_files/example.docx
diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py
@@ -18,8 +18,15 @@
 else:
     skip = False
 import os
+import sys
 import pytest
 
+try:
+    from unstructured.partition.auto import partition
+
+    HAS_UNSTRUCTURED = True
+except ImportError:
+    HAS_UNSTRUCTURED = False
 
 test_dir = os.path.join(os.path.dirname(__file__), "test_files")
 expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
@@ -47,7 +54,10 @@ def test_split_files_to_chunks(self):
         pdf_file_path = os.path.join(test_dir, "example.pdf")
         txt_file_path = os.path.join(test_dir, "example.txt")
         chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
-        assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks)
+        assert all(
+            isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
+            for chunk in chunks
+        )
 
     def test_get_files_from_dir(self):
         files = get_files_from_dir(test_dir)
@@ -161,14 +171,17 @@ def custom_text_split_function(text):
         )
         results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
         assert (
-            results.get("documents")[0][0]
-            == "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. The primary purpose o"
+            "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities"
+            in results.get("documents")[0][0]
         )
 
     def test_retrieve_utils(self):
         client = chromadb.PersistentClient(path="/tmp/chromadb")
         create_vector_db_from_dir(
-            dir_path="./website/docs", client=client, collection_name="autogen-docs", get_or_create=True
+            dir_path="./website/docs",
+            client=client,
+            collection_name="autogen-docs",
+            get_or_create=True,
         )
         results = query_vector_db(
             query_texts=[
@@ -182,6 +195,20 @@ def test_retrieve_utils(self):
         print(results["ids"][0])
         assert len(results["ids"][0]) == 4
 
+    @pytest.mark.skipif(
+        not HAS_UNSTRUCTURED,
+        reason="do not run if unstructured is not installed",
+    )
+    def test_unstructured(self):
+        pdf_file_path = os.path.join(test_dir, "example.pdf")
+        txt_file_path = os.path.join(test_dir, "example.txt")
+        word_file_path = os.path.join(test_dir, "example.docx")
+        chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path])
+        assert all(
+            isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
+            for chunk in chunks
+        )
+
 
 if __name__ == "__main__":
     pytest.main()

diff --git a/website/blog/2023-10-18-RetrieveChat/index.mdx b/website/blog/2023-10-18-RetrieveChat/index.mdx
@@ -54,6 +54,15 @@ Please install pyautogen with the [retrievechat] option before using RAG agents.
 pip install "pyautogen[retrievechat]"
 ```
 
+RetrieveChat can handle various types of documents. By default, it can process
+plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
+'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
+If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
+(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
+'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
+
+You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
+
 1. Import Agents
 ```python
 import autogen
@@ -474,3 +483,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
 You can check out more example notebooks for RAG use cases:
 - [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
 - [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
+- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
diff --git a/website/docs/Installation.md b/website/docs/Installation.md
@@ -68,7 +68,7 @@ Inference parameter tuning can be done via [`flaml.tune`](https://microsoft.gith
 - `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None.
 
 ### Optional Dependencies
-* docker
+- #### docker
 
 For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.
 
@@ -77,34 +77,51 @@ When running AutoGen out of a docker container, to use docker for code execution
 pip install docker
 ```
 
-* blendsearch
+- #### blendsearch
 
 `pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
 ```bash
 pip install "pyautogen[blendsearch]<0.2"
 ```
 
 Example notebooks:
-[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb),
+
+[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb)
+
 [Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)
 
-* retrievechat
+- #### retrievechat
 
 `pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
 ```bash
 pip install "pyautogen[retrievechat]<0.2"
 ```
 
+RetrieveChat can handle various types of documents. By default, it can process
+plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
+'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
+If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
+(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
+'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
+
+You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
+
 Example notebooks:
-[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb),
+
+[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
+
 [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
 
-* mathchat
+[Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
+
+
+- #### mathchat
 
 `pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
 ```bash
 pip install "pyautogen[mathchat]<0.2"
 ```
 
 Example notebooks:
+
 [Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)